/* jsimd_dspr2.S — MIPS DSPr2 SIMD routines for libjpeg-turbo */
  1. /*
  2. * MIPS DSPr2 optimizations for libjpeg-turbo
  3. *
  4. * Copyright (C) 2013-2014, MIPS Technologies, Inc., California.
  5. * All Rights Reserved.
  6. * Authors: Teodora Novkovic <teodora.novkovic@imgtec.com>
  7. * Darko Laus <darko.laus@imgtec.com>
  8. * Copyright (C) 2015, D. R. Commander. All Rights Reserved.
  9. *
  10. * This software is provided 'as-is', without any express or implied
  11. * warranty. In no event will the authors be held liable for any damages
  12. * arising from the use of this software.
  13. *
  14. * Permission is granted to anyone to use this software for any purpose,
  15. * including commercial applications, and to alter it and redistribute it
  16. * freely, subject to the following restrictions:
  17. *
  18. * 1. The origin of this software must not be misrepresented; you must not
  19. * claim that you wrote the original software. If you use this software
  20. * in a product, an acknowledgment in the product documentation would be
  21. * appreciated but is not required.
  22. * 2. Altered source versions must be plainly marked as such, and must not be
  23. * misrepresented as being the original software.
  24. * 3. This notice may not be removed or altered from any source distribution.
  25. */
  26. #include "jsimd_dspr2_asm.h"
  27. /*****************************************************************************/
  28. LEAF_DSPR2(jsimd_c_null_convert_dspr2)
  29. /*
  30. * a0 = cinfo->image_width
  31. * a1 = input_buf
  32. * a2 = output_buf
  33. * a3 = output_row
  34. * 16(sp) = num_rows
  35. * 20(sp) = cinfo->num_components
  36. *
  37. * Null conversion for compression
  38. */
  39. SAVE_REGS_ON_STACK 8, s0, s1
  40. lw t9, 24(sp) // t9 = num_rows
  41. lw s0, 28(sp) // s0 = cinfo->num_components
  42. andi t0, a0, 3 // t0 = cinfo->image_width & 3
  43. beqz t0, 4f // no residual
  44. nop
  45. 0:
  46. addiu t9, t9, -1
  47. bltz t9, 7f
  48. li t1, 0
  49. 1:
  50. sll t3, t1, 2
  51. lwx t5, t3(a2) // t5 = outptr = output_buf[ci]
  52. lw t2, 0(a1) // t2 = inptr = *input_buf
  53. sll t4, a3, 2
  54. lwx t5, t4(t5) // t5 = outptr = output_buf[ci][output_row]
  55. addu t2, t2, t1
  56. addu s1, t5, a0
  57. addu t6, t5, t0
  58. 2:
  59. lbu t3, 0(t2)
  60. addiu t5, t5, 1
  61. sb t3, -1(t5)
  62. bne t6, t5, 2b
  63. addu t2, t2, s0
  64. 3:
  65. lbu t3, 0(t2)
  66. addu t4, t2, s0
  67. addu t7, t4, s0
  68. addu t8, t7, s0
  69. addu t2, t8, s0
  70. lbu t4, 0(t4)
  71. lbu t7, 0(t7)
  72. lbu t8, 0(t8)
  73. addiu t5, t5, 4
  74. sb t3, -4(t5)
  75. sb t4, -3(t5)
  76. sb t7, -2(t5)
  77. bne s1, t5, 3b
  78. sb t8, -1(t5)
  79. addiu t1, t1, 1
  80. bne t1, s0, 1b
  81. nop
  82. addiu a1, a1, 4
  83. bgez t9, 0b
  84. addiu a3, a3, 1
  85. b 7f
  86. nop
  87. 4:
  88. addiu t9, t9, -1
  89. bltz t9, 7f
  90. li t1, 0
  91. 5:
  92. sll t3, t1, 2
  93. lwx t5, t3(a2) // t5 = outptr = output_buf[ci]
  94. lw t2, 0(a1) // t2 = inptr = *input_buf
  95. sll t4, a3, 2
  96. lwx t5, t4(t5) // t5 = outptr = output_buf[ci][output_row]
  97. addu t2, t2, t1
  98. addu s1, t5, a0
  99. addu t6, t5, t0
  100. 6:
  101. lbu t3, 0(t2)
  102. addu t4, t2, s0
  103. addu t7, t4, s0
  104. addu t8, t7, s0
  105. addu t2, t8, s0
  106. lbu t4, 0(t4)
  107. lbu t7, 0(t7)
  108. lbu t8, 0(t8)
  109. addiu t5, t5, 4
  110. sb t3, -4(t5)
  111. sb t4, -3(t5)
  112. sb t7, -2(t5)
  113. bne s1, t5, 6b
  114. sb t8, -1(t5)
  115. addiu t1, t1, 1
  116. bne t1, s0, 5b
  117. nop
  118. addiu a1, a1, 4
  119. bgez t9, 4b
  120. addiu a3, a3, 1
  121. 7:
  122. RESTORE_REGS_FROM_STACK 8, s0, s1
  123. j ra
  124. nop
  125. END(jsimd_c_null_convert_dspr2)
  126. /*****************************************************************************/
  127. /*
  128. * jsimd_extrgb_ycc_convert_dspr2
  129. * jsimd_extbgr_ycc_convert_dspr2
  130. * jsimd_extrgbx_ycc_convert_dspr2
  131. * jsimd_extbgrx_ycc_convert_dspr2
  132. * jsimd_extxbgr_ycc_convert_dspr2
  133. * jsimd_extxrgb_ycc_convert_dspr2
  134. *
  135. * Colorspace conversion RGB -> YCbCr
  136. */
  137. .macro GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 colorid, pixel_size, \
  138. r_offs, g_offs, b_offs
  139. .macro DO_RGB_TO_YCC r, g, b, inptr
  140. lbu \r, \r_offs(\inptr)
  141. lbu \g, \g_offs(\inptr)
  142. lbu \b, \b_offs(\inptr)
  143. addiu \inptr, \pixel_size
  144. .endm
  145. LEAF_DSPR2(jsimd_\colorid\()_ycc_convert_dspr2)
  146. /*
  147. * a0 = cinfo->image_width
  148. * a1 = input_buf
  149. * a2 = output_buf
  150. * a3 = output_row
  151. * 16(sp) = num_rows
  152. */
  153. SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
  154. lw t7, 48(sp) // t7 = num_rows
  155. li s0, 0x4c8b // FIX(0.29900)
  156. li s1, 0x9646 // FIX(0.58700)
  157. li s2, 0x1d2f // FIX(0.11400)
  158. li s3, 0xffffd4cd // -FIX(0.16874)
  159. li s4, 0xffffab33 // -FIX(0.33126)
  160. li s5, 0x8000 // FIX(0.50000)
  161. li s6, 0xffff94d1 // -FIX(0.41869)
  162. li s7, 0xffffeb2f // -FIX(0.08131)
  163. li t8, 0x807fff // CBCR_OFFSET + ONE_HALF-1
  164. 0:
  165. addiu t7, -1 // --num_rows
  166. lw t6, 0(a1) // t6 = input_buf[0]
  167. lw t0, 0(a2)
  168. lw t1, 4(a2)
  169. lw t2, 8(a2)
  170. sll t3, a3, 2
  171. lwx t0, t3(t0) // t0 = output_buf[0][output_row]
  172. lwx t1, t3(t1) // t1 = output_buf[1][output_row]
  173. lwx t2, t3(t2) // t2 = output_buf[2][output_row]
  174. addu t9, t2, a0 // t9 = end address
  175. addiu a3, 1
  176. 1:
  177. DO_RGB_TO_YCC t3, t4, t5, t6
  178. mtlo s5, $ac0
  179. mtlo t8, $ac1
  180. mtlo t8, $ac2
  181. maddu $ac0, s2, t5
  182. maddu $ac1, s5, t5
  183. maddu $ac2, s5, t3
  184. maddu $ac0, s0, t3
  185. maddu $ac1, s3, t3
  186. maddu $ac2, s6, t4
  187. maddu $ac0, s1, t4
  188. maddu $ac1, s4, t4
  189. maddu $ac2, s7, t5
  190. extr.w t3, $ac0, 16
  191. extr.w t4, $ac1, 16
  192. extr.w t5, $ac2, 16
  193. sb t3, 0(t0)
  194. sb t4, 0(t1)
  195. sb t5, 0(t2)
  196. addiu t0, 1
  197. addiu t2, 1
  198. bne t2, t9, 1b
  199. addiu t1, 1
  200. bgtz t7, 0b
  201. addiu a1, 4
  202. RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
  203. j ra
  204. nop
  205. END(jsimd_\colorid\()_ycc_convert_dspr2)
  206. .purgem DO_RGB_TO_YCC
  207. .endm
  208. /*-------------------------------------id -- pix R G B */
  209. GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extrgb, 3, 0, 1, 2
  210. GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extbgr, 3, 2, 1, 0
  211. GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extrgbx, 4, 0, 1, 2
  212. GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extbgrx, 4, 2, 1, 0
  213. GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extxbgr, 4, 3, 2, 1
  214. GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extxrgb, 4, 1, 2, 3
  215. /*****************************************************************************/
  216. /*
  217. * jsimd_ycc_extrgb_convert_dspr2
  218. * jsimd_ycc_extbgr_convert_dspr2
  219. * jsimd_ycc_extrgbx_convert_dspr2
  220. * jsimd_ycc_extbgrx_convert_dspr2
  221. * jsimd_ycc_extxbgr_convert_dspr2
  222. * jsimd_ycc_extxrgb_convert_dspr2
  223. *
  224. * Colorspace conversion YCbCr -> RGB
  225. */
  226. .macro GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 colorid, pixel_size, \
  227. r_offs, g_offs, b_offs, a_offs
  228. .macro STORE_YCC_TO_RGB scratch0 scratch1 scratch2 outptr
  229. sb \scratch0, \r_offs(\outptr)
  230. sb \scratch1, \g_offs(\outptr)
  231. sb \scratch2, \b_offs(\outptr)
  232. .if (\pixel_size == 4)
  233. li t0, 0xFF
  234. sb t0, \a_offs(\outptr)
  235. .endif
  236. addiu \outptr, \pixel_size
  237. .endm
  238. LEAF_DSPR2(jsimd_ycc_\colorid\()_convert_dspr2)
  239. /*
  240. * a0 = cinfo->image_width
  241. * a1 = input_buf
  242. * a2 = input_row
  243. * a3 = output_buf
  244. * 16(sp) = num_rows
  245. */
  246. SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
  247. lw s1, 48(sp)
  248. li t3, 0x8000
  249. li t4, 0x166e9 // FIX(1.40200)
  250. li t5, 0x1c5a2 // FIX(1.77200)
  251. li t6, 0xffff492e // -FIX(0.71414)
  252. li t7, 0xffffa7e6 // -FIX(0.34414)
  253. repl.ph t8, 128
  254. 0:
  255. lw s0, 0(a3)
  256. lw t0, 0(a1)
  257. lw t1, 4(a1)
  258. lw t2, 8(a1)
  259. sll s5, a2, 2
  260. addiu s1, -1
  261. lwx s2, s5(t0)
  262. lwx s3, s5(t1)
  263. lwx s4, s5(t2)
  264. addu t9, s2, a0
  265. addiu a2, 1
  266. 1:
  267. lbu s7, 0(s4) // cr
  268. lbu s6, 0(s3) // cb
  269. lbu s5, 0(s2) // y
  270. addiu s2, 1
  271. addiu s4, 1
  272. addiu s7, -128
  273. addiu s6, -128
  274. mul t2, t7, s6
  275. mul t0, t6, s7 // Crgtab[cr]
  276. sll s7, 15
  277. mulq_rs.w t1, t4, s7 // Crrtab[cr]
  278. sll s6, 15
  279. addu t2, t3 // Cbgtab[cb]
  280. addu t2, t0
  281. mulq_rs.w t0, t5, s6 // Cbbtab[cb]
  282. sra t2, 16
  283. addu t1, s5
  284. addu t2, s5 // add y
  285. ins t2, t1, 16, 16
  286. subu.ph t2, t2, t8
  287. addu t0, s5
  288. shll_s.ph t2, t2, 8
  289. subu t0, 128
  290. shra.ph t2, t2, 8
  291. shll_s.w t0, t0, 24
  292. addu.ph t2, t2, t8 // clip & store
  293. sra t0, t0, 24
  294. sra t1, t2, 16
  295. addiu t0, 128
  296. STORE_YCC_TO_RGB t1, t2, t0, s0
  297. bne s2, t9, 1b
  298. addiu s3, 1
  299. bgtz s1, 0b
  300. addiu a3, 4
  301. RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
  302. j ra
  303. nop
  304. END(jsimd_ycc_\colorid\()_convert_dspr2)
  305. .purgem STORE_YCC_TO_RGB
  306. .endm
  307. /*-------------------------------------id -- pix R G B A */
  308. GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extrgb, 3, 0, 1, 2, 3
  309. GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extbgr, 3, 2, 1, 0, 3
  310. GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extrgbx, 4, 0, 1, 2, 3
  311. GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extbgrx, 4, 2, 1, 0, 3
  312. GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extxbgr, 4, 3, 2, 1, 0
  313. GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extxrgb, 4, 1, 2, 3, 0
  314. /*****************************************************************************/
  315. /*
  316. * jsimd_extrgb_gray_convert_dspr2
  317. * jsimd_extbgr_gray_convert_dspr2
  318. * jsimd_extrgbx_gray_convert_dspr2
  319. * jsimd_extbgrx_gray_convert_dspr2
  320. * jsimd_extxbgr_gray_convert_dspr2
  321. * jsimd_extxrgb_gray_convert_dspr2
  322. *
  323. * Colorspace conversion RGB -> GRAY
  324. */
  325. .macro GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 colorid, pixel_size, \
  326. r_offs, g_offs, b_offs
  327. .macro DO_RGB_TO_GRAY r, g, b, inptr
  328. lbu \r, \r_offs(\inptr)
  329. lbu \g, \g_offs(\inptr)
  330. lbu \b, \b_offs(\inptr)
  331. addiu \inptr, \pixel_size
  332. .endm
  333. LEAF_DSPR2(jsimd_\colorid\()_gray_convert_dspr2)
  334. /*
  335. * a0 = cinfo->image_width
  336. * a1 = input_buf
  337. * a2 = output_buf
  338. * a3 = output_row
  339. * 16(sp) = num_rows
  340. */
  341. SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
  342. li s0, 0x4c8b // s0 = FIX(0.29900)
  343. li s1, 0x9646 // s1 = FIX(0.58700)
  344. li s2, 0x1d2f // s2 = FIX(0.11400)
  345. li s7, 0x8000 // s7 = FIX(0.50000)
  346. lw s6, 48(sp)
  347. andi t7, a0, 3
  348. 0:
  349. addiu s6, -1 // s6 = num_rows
  350. lw t0, 0(a1)
  351. lw t1, 0(a2)
  352. sll t3, a3, 2
  353. lwx t1, t3(t1)
  354. addiu a3, 1
  355. addu t9, t1, a0
  356. subu t8, t9, t7
  357. beq t1, t8, 2f
  358. nop
  359. 1:
  360. DO_RGB_TO_GRAY t3, t4, t5, t0
  361. DO_RGB_TO_GRAY s3, s4, s5, t0
  362. mtlo s7, $ac0
  363. maddu $ac0, s2, t5
  364. maddu $ac0, s1, t4
  365. maddu $ac0, s0, t3
  366. mtlo s7, $ac1
  367. maddu $ac1, s2, s5
  368. maddu $ac1, s1, s4
  369. maddu $ac1, s0, s3
  370. extr.w t6, $ac0, 16
  371. DO_RGB_TO_GRAY t3, t4, t5, t0
  372. DO_RGB_TO_GRAY s3, s4, s5, t0
  373. mtlo s7, $ac0
  374. maddu $ac0, s2, t5
  375. maddu $ac0, s1, t4
  376. extr.w t2, $ac1, 16
  377. maddu $ac0, s0, t3
  378. mtlo s7, $ac1
  379. maddu $ac1, s2, s5
  380. maddu $ac1, s1, s4
  381. maddu $ac1, s0, s3
  382. extr.w t5, $ac0, 16
  383. sb t6, 0(t1)
  384. sb t2, 1(t1)
  385. extr.w t3, $ac1, 16
  386. addiu t1, 4
  387. sb t5, -2(t1)
  388. sb t3, -1(t1)
  389. bne t1, t8, 1b
  390. nop
  391. 2:
  392. beqz t7, 4f
  393. nop
  394. 3:
  395. DO_RGB_TO_GRAY t3, t4, t5, t0
  396. mtlo s7, $ac0
  397. maddu $ac0, s2, t5
  398. maddu $ac0, s1, t4
  399. maddu $ac0, s0, t3
  400. extr.w t6, $ac0, 16
  401. sb t6, 0(t1)
  402. addiu t1, 1
  403. bne t1, t9, 3b
  404. nop
  405. 4:
  406. bgtz s6, 0b
  407. addiu a1, 4
  408. RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
  409. j ra
  410. nop
  411. END(jsimd_\colorid\()_gray_convert_dspr2)
  412. .purgem DO_RGB_TO_GRAY
  413. .endm
  414. /*-------------------------------------id -- pix R G B */
  415. GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extrgb, 3, 0, 1, 2
  416. GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extbgr, 3, 2, 1, 0
  417. GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extrgbx, 4, 0, 1, 2
  418. GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extbgrx, 4, 2, 1, 0
  419. GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extxbgr, 4, 3, 2, 1
  420. GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extxrgb, 4, 1, 2, 3
  421. /*****************************************************************************/
  422. /*
  423. * jsimd_h2v2_merged_upsample_dspr2
  424. * jsimd_h2v2_extrgb_merged_upsample_dspr2
  425. * jsimd_h2v2_extrgbx_merged_upsample_dspr2
  426. * jsimd_h2v2_extbgr_merged_upsample_dspr2
  427. * jsimd_h2v2_extbgrx_merged_upsample_dspr2
  428. * jsimd_h2v2_extxbgr_merged_upsample_dspr2
  429. * jsimd_h2v2_extxrgb_merged_upsample_dspr2
  430. *
  431. * Merged h2v2 upsample routines
  432. */
/*
 * GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 colorid, pixel_size, ...
 *
 * Emits jsimd_h2v2_<colorid>_merged_upsample_dspr2: merged 2x2 chroma
 * upsampling + YCbCr->RGB conversion, specialized for one output pixel
 * layout.  pixel_size is the byte stride for TWO output pixels (6 = 3-byte
 * RGB, 8 = 4-byte RGBX); the *_offs arguments give the byte position of
 * each component within that 2-pixel group.  The a*_offs slots are only
 * used when pixel_size == 8 (alpha filled with 0xFF).
 */
.macro GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 colorid, pixel_size, \
                                           r1_offs, g1_offs, \
                                           b1_offs, a1_offs, \
                                           r2_offs, g2_offs, \
                                           b2_offs, a2_offs

/* Store two output pixels and advance outptr by pixel_size bytes. */
.macro STORE_H2V2_2_PIXELS scratch0 scratch1 scratch2 scratch3 scratch4 \
                           scratch5 outptr
    sb      \scratch0, \r1_offs(\outptr)
    sb      \scratch1, \g1_offs(\outptr)
    sb      \scratch2, \b1_offs(\outptr)
    sb      \scratch3, \r2_offs(\outptr)
    sb      \scratch4, \g2_offs(\outptr)
    sb      \scratch5, \b2_offs(\outptr)
.if (\pixel_size == 8)
    li      \scratch0, 0xFF            // opaque alpha for 4-byte formats
    sb      \scratch0, \a1_offs(\outptr)
    sb      \scratch0, \a2_offs(\outptr)
.endif
    addiu   \outptr, \pixel_size
.endm

/* Store a single trailing output pixel (odd output_width); outptr not advanced. */
.macro STORE_H2V2_1_PIXEL scratch0 scratch1 scratch2 outptr
    sb      \scratch0, \r1_offs(\outptr)
    sb      \scratch1, \g1_offs(\outptr)
    sb      \scratch2, \b1_offs(\outptr)
.if (\pixel_size == 8)
    li      t0, 0xFF
    sb      t0, \a1_offs(\outptr)
.endif
.endm

LEAF_DSPR2(jsimd_h2v2_\colorid\()_merged_upsample_dspr2)
/*
 * a0     = cinfo->output_width
 * a1     = input_buf
 * a2     = in_row_group_ctr
 * a3     = output_buf
 * 16(sp) = cinfo->sample_range_limit
 */
    SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra

    lw      t9, 56(sp)                 // cinfo->sample_range_limit (16(sp) + 40 saved)
    lw      v0, 0(a1)
    lw      v1, 4(a1)
    lw      t0, 8(a1)
    sll     t1, a2, 3
    addiu   t2, t1, 4
    sll     t3, a2, 2
    lw      t4, 0(a3)                  // t4 = output_buf[0]
    lwx     t1, t1(v0)                 // t1 = input_buf[0][in_row_group_ctr*2]
    lwx     t2, t2(v0)                 // t2 = input_buf[0][in_row_group_ctr*2 + 1]
    lwx     t5, t3(v1)                 // t5 = input_buf[1][in_row_group_ctr]
    lwx     t6, t3(t0)                 // t6 = input_buf[2][in_row_group_ctr]
    lw      t7, 4(a3)                  // t7 = output_buf[1]
    /* Build the fixed-point color-conversion constants by arithmetic
     * (addiu immediates are limited to 16 bits). */
    li      s1, 0xe6ea
    addiu   t8, s1, 0x7fff             // t8 = 0x166e9 [FIX(1.40200)]
    addiu   s0, t8, 0x5eb9             // s0 = 0x1c5a2 [FIX(1.77200)]
    addiu   s1, zero, 0xa7e6           // s1 = 0xffffa7e6 [-FIX(0.34414)]
    xori    s2, s1, 0xeec8             // s2 = 0xffff492e [-FIX(0.71414)]
    srl     t3, a0, 1
    blez    t3, 2f
     addu   t0, t5, t3                 // (delay slot) t0 = end address of Cb row
1:  /* Main loop: one Cb/Cr pair -> 2x2 output pixels (2 per output row). */
    lbu     t3, 0(t5)
    lbu     s3, 0(t6)
    addiu   t5, t5, 1
    addiu   t3, t3, -128               // (cb - 128)
    addiu   s3, s3, -128               // (cr - 128)
    mult    $ac1, s1, t3
    madd    $ac1, s2, s3
    sll     s3, s3, 15
    sll     t3, t3, 15
    mulq_rs.w s4, t8, s3               // s4 = (C1 * cr + ONE_HALF)>> SCALEBITS
    extr_r.w s5, $ac1, 16              // s5 = cgreen
    mulq_rs.w s6, s0, t3               // s6 = (C2 * cb + ONE_HALF)>> SCALEBITS
    lbu     v0, 0(t1)
    addiu   t6, t6, 1
    addiu   t1, t1, 2
    addu    t3, v0, s4                 // y+cred
    addu    s3, v0, s5                 // y+cgreen
    addu    v1, v0, s6                 // y+cblue
    addu    t3, t9, t3                 // range-limit lookup addresses
    addu    s3, t9, s3
    addu    v1, t9, v1
    lbu     AT, 0(t3)
    lbu     s7, 0(s3)
    lbu     ra, 0(v1)
    lbu     v0, -1(t1)                 // second luma sample, row 0
    addu    t3, v0, s4                 // y+cred
    addu    s3, v0, s5                 // y+cgreen
    addu    v1, v0, s6                 // y+cblue
    addu    t3, t9, t3
    addu    s3, t9, s3
    addu    v1, t9, v1
    lbu     t3, 0(t3)
    lbu     s3, 0(s3)
    lbu     v1, 0(v1)
    lbu     v0, 0(t2)                  // first luma sample, row 1
    STORE_H2V2_2_PIXELS AT, s7, ra, t3, s3, v1, t4
    addu    t3, v0, s4                 // y+cred
    addu    s3, v0, s5                 // y+cgreen
    addu    v1, v0, s6                 // y+cblue
    addu    t3, t9, t3
    addu    s3, t9, s3
    addu    v1, t9, v1
    lbu     AT, 0(t3)
    lbu     s7, 0(s3)
    lbu     ra, 0(v1)
    lbu     v0, 1(t2)                  // second luma sample, row 1
    addiu   t2, t2, 2
    addu    t3, v0, s4                 // y+cred
    addu    s3, v0, s5                 // y+cgreen
    addu    v1, v0, s6                 // y+cblue
    addu    t3, t9, t3
    addu    s3, t9, s3
    addu    v1, t9, v1
    lbu     t3, 0(t3)
    lbu     s3, 0(s3)
    lbu     v1, 0(v1)
    STORE_H2V2_2_PIXELS AT, s7, ra, t3, s3, v1, t7
    bne     t0, t5, 1b
     nop
2:  /* Odd output_width: emit one final pixel on each of the two rows. */
    andi    t0, a0, 1
    beqz    t0, 4f
     lbu    t3, 0(t5)                  // (delay slot; harmless when branch taken)
    lbu     s3, 0(t6)
    addiu   t3, t3, -128               // (cb - 128)
    addiu   s3, s3, -128               // (cr - 128)
    mult    $ac1, s1, t3
    madd    $ac1, s2, s3
    sll     s3, s3, 15
    sll     t3, t3, 15
    lbu     v0, 0(t1)
    extr_r.w s5, $ac1, 16
    mulq_rs.w s4, t8, s3               // s4 = (C1 * cr + ONE_HALF)>> SCALEBITS
    mulq_rs.w s6, s0, t3               // s6 = (C2 * cb + ONE_HALF)>> SCALEBITS
    addu    t3, v0, s4                 // y+cred
    addu    s3, v0, s5                 // y+cgreen
    addu    v1, v0, s6                 // y+cblue
    addu    t3, t9, t3
    addu    s3, t9, s3
    addu    v1, t9, v1
    lbu     t3, 0(t3)
    lbu     s3, 0(s3)
    lbu     v1, 0(v1)
    lbu     v0, 0(t2)
    STORE_H2V2_1_PIXEL t3, s3, v1, t4
    addu    t3, v0, s4                 // y+cred
    addu    s3, v0, s5                 // y+cgreen
    addu    v1, v0, s6                 // y+cblue
    addu    t3, t9, t3
    addu    s3, t9, s3
    addu    v1, t9, v1
    lbu     t3, 0(t3)
    lbu     s3, 0(s3)
    lbu     v1, 0(v1)
    STORE_H2V2_1_PIXEL t3, s3, v1, t7
4:
    RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra
    j       ra
     nop
END(jsimd_h2v2_\colorid\()_merged_upsample_dspr2)

.purgem STORE_H2V2_1_PIXEL
.purgem STORE_H2V2_2_PIXELS
.endm
/* Instantiate one h2v2 merged-upsample routine per extended pixel layout.
 * "pix" is bytes per TWO output pixels; A offsets are unused for pix == 6. */
/*------------------------------------id -- pix R1 G1 B1 A1 R2 G2 B2 A2 */
GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extrgb,  6,  0, 1, 2, 6, 3, 4, 5, 6
GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extbgr,  6,  2, 1, 0, 3, 5, 4, 3, 6
GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extrgbx, 8,  0, 1, 2, 3, 4, 5, 6, 7
GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extbgrx, 8,  2, 1, 0, 3, 6, 5, 4, 7
GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extxbgr, 8,  3, 2, 1, 0, 7, 6, 5, 4
GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extxrgb, 8,  1, 2, 3, 0, 5, 6, 7, 4
  603. /*****************************************************************************/
  604. /*
  605. * jsimd_h2v1_merged_upsample_dspr2
  606. * jsimd_h2v1_extrgb_merged_upsample_dspr2
  607. * jsimd_h2v1_extrgbx_merged_upsample_dspr2
  608. * jsimd_h2v1_extbgr_merged_upsample_dspr2
  609. * jsimd_h2v1_extbgrx_merged_upsample_dspr2
  610. * jsimd_h2v1_extxbgr_merged_upsample_dspr2
  611. * jsimd_h2v1_extxrgb_merged_upsample_dspr2
  612. *
  613. * Merged h2v1 upsample routines
  614. */
/*
 * GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 colorid, pixel_size, ...
 *
 * Emits jsimd_h2v1_<colorid>_merged_upsample_dspr2: merged 2x1 chroma
 * upsampling + YCbCr->RGB conversion for one output pixel layout.
 * Parameters have the same meaning as in the h2v2 generator above:
 * pixel_size = bytes per TWO output pixels; a*_offs only used when
 * pixel_size == 8.
 */
.macro GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 colorid, pixel_size, \
                                           r1_offs, g1_offs, \
                                           b1_offs, a1_offs, \
                                           r2_offs, g2_offs, \
                                           b2_offs, a2_offs

/* Store two output pixels and advance outptr by pixel_size bytes. */
.macro STORE_H2V1_2_PIXELS scratch0 scratch1 scratch2 scratch3 scratch4 \
                           scratch5 outptr
    sb      \scratch0, \r1_offs(\outptr)
    sb      \scratch1, \g1_offs(\outptr)
    sb      \scratch2, \b1_offs(\outptr)
    sb      \scratch3, \r2_offs(\outptr)
    sb      \scratch4, \g2_offs(\outptr)
    sb      \scratch5, \b2_offs(\outptr)
.if (\pixel_size == 8)
    li      t0, 0xFF                   // opaque alpha for 4-byte formats
    sb      t0, \a1_offs(\outptr)
    sb      t0, \a2_offs(\outptr)
.endif
    addiu   \outptr, \pixel_size
.endm

/* Store a single trailing output pixel (odd output_width). */
.macro STORE_H2V1_1_PIXEL scratch0 scratch1 scratch2 outptr
    sb      \scratch0, \r1_offs(\outptr)
    sb      \scratch1, \g1_offs(\outptr)
    sb      \scratch2, \b1_offs(\outptr)
.if (\pixel_size == 8)
    li      t0, 0xFF
    sb      t0, \a1_offs(\outptr)
.endif
.endm

LEAF_DSPR2(jsimd_h2v1_\colorid\()_merged_upsample_dspr2)
/*
 * a0     = cinfo->output_width
 * a1     = input_buf
 * a2     = in_row_group_ctr
 * a3     = output_buf
 * 16(sp) = range_limit
 */
    SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra

    li      t0, 0xe6ea
    lw      t1, 0(a1)                  // t1 = input_buf[0]
    lw      t2, 4(a1)                  // t2 = input_buf[1]
    lw      t3, 8(a1)                  // t3 = input_buf[2]
    lw      t8, 56(sp)                 // t8 = range_limit (16(sp) + 40 saved)
    /* Fixed-point conversion constants, built by arithmetic. */
    addiu   s1, t0, 0x7fff             // s1 = 0x166e9 [FIX(1.40200)]
    addiu   s2, s1, 0x5eb9             // s2 = 0x1c5a2 [FIX(1.77200)]
    addiu   s0, t0, 0x9916             // s0 = 0x8000 (ONE_HALF for the scalar tail)
    addiu   s4, zero, 0xa7e6           // s4 = 0xffffa7e6 [-FIX(0.34414)]
    xori    s3, s4, 0xeec8             // s3 = 0xffff492e [-FIX(0.71414)]
    srl     t0, a0, 1
    sll     t4, a2, 2
    lwx     s5, t4(t1)                 // s5 = inptr0
    lwx     s6, t4(t2)                 // s6 = inptr1
    lwx     s7, t4(t3)                 // s7 = inptr2
    lw      t7, 0(a3)                  // t7 = outptr
    blez    t0, 2f
     addu   t9, s6, t0                 // (delay slot) t9 = end address of Cb row
1:  /* Main loop: one Cb/Cr pair -> 2 output pixels. */
    lbu     t2, 0(s6)                  // t2 = cb
    lbu     t0, 0(s7)                  // t0 = cr
    lbu     t1, 0(s5)                  // t1 = y
    addiu   t2, t2, -128               // t2 = cb - 128
    addiu   t0, t0, -128               // t0 = cr - 128
    mult    $ac1, s4, t2
    madd    $ac1, s3, t0
    sll     t0, t0, 15
    sll     t2, t2, 15
    mulq_rs.w t0, s1, t0               // t0 = (C1*cr + ONE_HALF)>> SCALEBITS
    extr_r.w t5, $ac1, 16              // t5 = cgreen
    mulq_rs.w t6, s2, t2               // t6 = (C2*cb + ONE_HALF)>> SCALEBITS
    addiu   s7, s7, 1
    addiu   s6, s6, 1
    addu    t2, t1, t0                 // t2 = y + cred
    addu    t3, t1, t5                 // t3 = y + cgreen
    addu    t4, t1, t6                 // t4 = y + cblue
    addu    t2, t8, t2                 // range-limit lookups
    addu    t3, t8, t3
    addu    t4, t8, t4
    lbu     t1, 1(s5)                  // second luma sample
    lbu     v0, 0(t2)
    lbu     v1, 0(t3)
    lbu     ra, 0(t4)
    addu    t2, t1, t0
    addu    t3, t1, t5
    addu    t4, t1, t6
    addu    t2, t8, t2
    addu    t3, t8, t3
    addu    t4, t8, t4
    lbu     t2, 0(t2)
    lbu     t3, 0(t3)
    lbu     t4, 0(t4)
    STORE_H2V1_2_PIXELS v0, v1, ra, t2, t3, t4, t7
    bne     t9, s6, 1b
     addiu  s5, s5, 2                  // (delay slot) advance luma pointer
2:  /* Odd output_width: one final pixel, computed without the DSP accumulator. */
    andi    t0, a0, 1
    beqz    t0, 4f
     nop
3:
    lbu     t2, 0(s6)
    lbu     t0, 0(s7)
    lbu     t1, 0(s5)
    addiu   t2, t2, -128               // (cb - 128)
    addiu   t0, t0, -128               // (cr - 128)
    mul     t3, s4, t2
    mul     t4, s3, t0
    sll     t0, t0, 15
    sll     t2, t2, 15
    mulq_rs.w t0, s1, t0               // (C1*cr + ONE_HALF)>> SCALEBITS
    mulq_rs.w t6, s2, t2               // (C2*cb + ONE_HALF)>> SCALEBITS
    addu    t3, t3, s0                 // + ONE_HALF
    addu    t3, t4, t3
    sra     t5, t3, 16                 // (C4*cb + ONE_HALF + C3*cr)>> SCALEBITS
    addu    t2, t1, t0                 // y + cred
    addu    t3, t1, t5                 // y + cgreen
    addu    t4, t1, t6                 // y + cblue
    addu    t2, t8, t2
    addu    t3, t8, t3
    addu    t4, t8, t4
    lbu     t2, 0(t2)
    lbu     t3, 0(t3)
    lbu     t4, 0(t4)
    STORE_H2V1_1_PIXEL t2, t3, t4, t7
4:
    RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra
    j       ra
     nop
END(jsimd_h2v1_\colorid\()_merged_upsample_dspr2)

.purgem STORE_H2V1_1_PIXEL
.purgem STORE_H2V1_2_PIXELS
.endm
/* Instantiate one h2v1 merged-upsample routine per extended pixel layout.
 * "pix" is bytes per TWO output pixels; A offsets are unused for pix == 6. */
/*------------------------------------id -- pix R1 G1 B1 A1 R2 G2 B2 A2 */
GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extrgb,  6,  0, 1, 2, 6, 3, 4, 5, 6
GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extbgr,  6,  2, 1, 0, 3, 5, 4, 3, 6
GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extrgbx, 8,  0, 1, 2, 3, 4, 5, 6, 7
GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extbgrx, 8,  2, 1, 0, 3, 6, 5, 4, 7
GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extxbgr, 8,  3, 2, 1, 0, 7, 6, 5, 4
GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extxrgb, 8,  1, 2, 3, 0, 5, 6, 7, 4
  752. /*****************************************************************************/
  753. /*
  754. * jsimd_h2v2_fancy_upsample_dspr2
  755. *
  756. * Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
  757. */
LEAF_DSPR2(jsimd_h2v2_fancy_upsample_dspr2)
/*
 * Fancy (triangle-filter) 2x2 upsampling.  Each input row produces two
 * output rows; the inner loops blend each sample 3:1 with its vertical
 * neighbor (thiscolsum/nextcolsum) and then 3:1 horizontally.
 *
 * a0 = cinfo->max_v_samp_factor
 * a1 = downsampled_width
 * a2 = input_data
 * a3 = output_data_ptr
 */
    SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4, s5

    li      s4, 0                      // s4 = output row counter * 4
    lw      s2, 0(a3)                  // s2 = *output_data_ptr
0:  /* Per input row: emit two output rows (t9 counts them). */
    li      t9, 2
    lw      s1, -4(a2)                 // s1 = inptr1 (row above for first pass)
1:  /* Per output row. */
    lw      s0, 0(a2)                  // s0 = inptr0
    lwx     s3, s4(s2)                 // s3 = outptr
    addiu   s5, a1, -2                 // s5 = downsampled_width - 2
    srl     t4, s5, 1
    sll     t4, t4, 1                  // t4 = count rounded down to even
    /* Special first column: left neighbor is the column itself. */
    lbu     t0, 0(s0)
    lbu     t1, 1(s0)
    lbu     t2, 0(s1)
    lbu     t3, 1(s1)
    addiu   s0, 2
    addiu   s1, 2
    addu    t8, s0, t4                 // t8 = end address of paired loop
    andi    s5, s5, 1                  // s5 = residual
    sll     t4, t0, 1
    sll     t6, t1, 1
    addu    t0, t0, t4                 // t0 = (*inptr0++) * 3
    addu    t1, t1, t6                 // t1 = (*inptr0++) * 3
    addu    t7, t0, t2                 // t7 = thiscolsum
    addu    t6, t1, t3                 // t6 = nextcolsum
    sll     t0, t7, 2                  // t0 = thiscolsum * 4
    subu    t1, t0, t7                 // t1 = thiscolsum * 3
    shra_r.w t0, t0, 4                 // (thiscolsum*4 + 8) >> 4
    addiu   t1, 7
    addu    t1, t1, t6
    srl     t1, t1, 4                  // (thiscolsum*3 + nextcolsum + 7) >> 4
    sb      t0, 0(s3)
    sb      t1, 1(s3)
    beq     t8, s0, 22f                // skip to final iteration if width == 3
     addiu  s3, 2
2:  /* Pairwise inner loop: two input columns -> four output samples. */
    lh      t0, 0(s0)                  // t0 = A3|A2
    lh      t2, 0(s1)                  // t2 = B3|B2
    addiu   s0, 2
    addiu   s1, 2
    preceu.ph.qbr t0, t0               // t0 = 0|A3|0|A2
    preceu.ph.qbr t2, t2               // t2 = 0|B3|0|B2
    shll.ph t1, t0, 1
    sll     t3, t6, 1
    addu.ph t0, t1, t0                 // t0 = A3*3|A2*3
    addu    t3, t3, t6                 // t3 = this * 3
    addu.ph t0, t0, t2                 // t0 = next2|next1
    addu    t1, t3, t7
    andi    t7, t0, 0xFFFF             // t7 = next1
    sll     t2, t7, 1
    addu    t2, t7, t2                 // t2 = next1*3
    addu    t4, t2, t6
    srl     t6, t0, 16                 // t6 = next2
    shra_r.w t1, t1, 4                 // t1 = (this*3 + last + 8) >> 4
    addu    t0, t3, t7
    addiu   t0, 7
    srl     t0, t0, 4                  // t0 = (this*3 + next1 + 7) >> 4
    shra_r.w t4, t4, 4                 // t4 = (next1*3 + this + 8) >> 4
    addu    t2, t2, t6
    addiu   t2, 7
    srl     t2, t2, 4                  // t2 = (next1*3 + next2 + 7) >> 4
    sb      t1, 0(s3)
    sb      t0, 1(s3)
    sb      t4, 2(s3)
    sb      t2, 3(s3)
    bne     t8, s0, 2b
     addiu  s3, 4
22: /* Residual single column, if width was odd. */
    beqz    s5, 4f
     addu   t8, s0, s5
3:
    lbu     t0, 0(s0)
    lbu     t2, 0(s1)
    addiu   s0, 1
    addiu   s1, 1
    sll     t3, t6, 1
    sll     t1, t0, 1
    addu    t1, t0, t1                 // t1 = inptr0 * 3
    addu    t3, t3, t6                 // t3 = thiscolsum * 3
    addu    t5, t1, t2
    addu    t1, t3, t7
    shra_r.w t1, t1, 4
    addu    t0, t3, t5
    addiu   t0, 7
    srl     t0, t0, 4
    sb      t1, 0(s3)
    sb      t0, 1(s3)
    addiu   s3, 2
    move    t7, t6
    bne     t8, s0, 3b
     move   t6, t5                     // (delay slot) shift column sums left
4:  /* Special last column: right neighbor is the column itself. */
    sll     t0, t6, 2                  // t0 = thiscolsum * 4
    subu    t1, t0, t6                 // t1 = thiscolsum * 3
    addu    t1, t1, t7
    addiu   s4, 4                      // next output row
    shra_r.w t1, t1, 4
    addiu   t0, 7
    srl     t0, t0, 4
    sb      t1, 0(s3)
    sb      t0, 1(s3)
    addiu   t9, -1
    addiu   s3, 2
    bnez    t9, 1b
     lw     s1, 4(a2)                  // (delay slot) second pass uses row below
    srl     t0, s4, 2
    subu    t0, a0, t0
    bgtz    t0, 0b
     addiu  a2, 4                      // (delay slot) next input row
    RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4, s5

    j       ra
     nop
END(jsimd_h2v2_fancy_upsample_dspr2)
  879. /*****************************************************************************/
LEAF_DSPR2(jsimd_h2v1_fancy_upsample_dspr2)
/*
 * Fancy (triangle-filter) 2x1 upsampling: each input sample yields two
 * output samples blended 3:1 with the left/right neighbor.  The inner
 * loop processes four input samples at a time with paired-halfword ops.
 *
 * a0 = cinfo->max_v_samp_factor
 * a1 = downsampled_width
 * a2 = input_data
 * a3 = output_data_ptr
 */
    SAVE_REGS_ON_STACK 16, s0, s1, s2, s3

    .set at

    beqz    a0, 3f
     sll    t0, a0, 2                  // (delay slot) t0 = row count * 4
    lw      s1, 0(a3)
    li      s3, 0x10001                // packed +1 rounding constant (per halfword)
    addu    s0, s1, t0                 // s0 = end of output row-pointer array
0:  /* Per row. */
    addiu   t8, a1, -2
    srl     t9, t8, 2                  // t9 = number of 4-sample groups
    lw      t7, 0(a2)                  // t7 = inptr
    lw      s2, 0(s1)                  // s2 = outptr
    /* First column: left neighbor is the sample itself. */
    lbu     t0, 0(t7)
    lbu     t1, 1(t7)                  // t1 = inptr[1]
    sll     t2, t0, 1
    addu    t2, t2, t0                 // t2 = invalue*3
    addu    t2, t2, t1
    shra_r.w t2, t2, 2
    sb      t0, 0(s2)
    sb      t2, 1(s2)
    beqz    t9, 11f
     addiu  s2, 2
1:  /* 4-sample SIMD inner loop -> 8 output samples. */
    ulw     t0, 0(t7)                  // t0 = |P3|P2|P1|P0|
    ulw     t1, 1(t7)
    ulh     t2, 4(t7)                  // t2 = |0|0|P5|P4|
    preceu.ph.qbl t3, t0               // t3 = |0|P3|0|P2|
    preceu.ph.qbr t0, t0               // t0 = |0|P1|0|P0|
    preceu.ph.qbr t2, t2               // t2 = |0|P5|0|P4|
    preceu.ph.qbl t4, t1               // t4 = |0|P4|0|P3|
    preceu.ph.qbr t1, t1               // t1 = |0|P2|0|P1|
    shll.ph t5, t4, 1
    shll.ph t6, t1, 1
    addu.ph t5, t5, t4                 // t5 = |P4*3|P3*3|
    addu.ph t6, t6, t1                 // t6 = |P2*3|P1*3|
    addu.ph t4, t3, s3                 // +1 rounding (truncating shift below)
    addu.ph t0, t0, s3
    addu.ph t4, t4, t5
    addu.ph t0, t0, t6
    shrl.ph t4, t4, 2                  // t4 = |0|P3|0|P2| (right-of-pair outputs)
    shrl.ph t0, t0, 2                  // t0 = |0|P1|0|P0|
    addu.ph t2, t2, t5
    addu.ph t3, t3, t6
    shra_r.ph t2, t2, 2                // t2 = |0|P5|0|P4| (round +2)
    shra_r.ph t3, t3, 2                // t3 = |0|P3|0|P2|
    shll.ph t2, t2, 8                  // interleave into byte lanes
    shll.ph t3, t3, 8
    or      t2, t4, t2
    or      t3, t3, t0
    addiu   t9, -1
    usw     t3, 0(s2)
    usw     t2, 4(s2)
    addiu   s2, 8
    bgtz    t9, 1b
     addiu  t7, 4
11: /* Scalar residual (0-3 samples before the last column). */
    andi    t8, 3
    beqz    t8, 22f
     addiu  t7, 1
2:
    lbu     t0, 0(t7)
    addiu   t7, 1
    sll     t1, t0, 1
    addu    t2, t0, t1                 // t2 = invalue*3
    lbu     t3, -2(t7)
    lbu     t4, 0(t7)
    addiu   t3, 1                      // left blend, rounding +1
    addiu   t4, 2                      // right blend, rounding +2
    addu    t3, t3, t2
    addu    t4, t4, t2
    srl     t3, 2
    srl     t4, 2
    sb      t3, 0(s2)
    sb      t4, 1(s2)
    addiu   t8, -1
    bgtz    t8, 2b
     addiu  s2, 2
22: /* Last column: right neighbor is the sample itself. */
    lbu     t0, 0(t7)
    lbu     t2, -1(t7)
    sll     t1, t0, 1
    addu    t1, t1, t0                 // t1 = invalue * 3
    addu    t1, t1, t2
    addiu   t1, 1
    srl     t1, t1, 2
    sb      t1, 0(s2)
    sb      t0, 1(s2)
    addiu   s1, 4
    bne     s1, s0, 0b
     addiu  a2, 4
3:
    RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3

    j       ra
     nop
END(jsimd_h2v1_fancy_upsample_dspr2)
  982. /*****************************************************************************/
LEAF_DSPR2(jsimd_h2v1_downsample_dspr2)
/*
 * 2:1 horizontal downsampling: averages horizontal pixel pairs with an
 * alternating 0/1 bias, then pads the output row out to
 * width_in_blocks*DCTSIZE by replicating the last sample.
 *
 * a0     = cinfo->image_width
 * a1     = cinfo->max_v_samp_factor
 * a2     = compptr->v_samp_factor
 * a3     = compptr->width_in_blocks
 * 16(sp) = input_data
 * 20(sp) = output_data
 */
    .set at

    SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4

    beqz    a2, 7f
     lw     s1, 44(sp)                 // (delay slot) s1 = output_data
    lw      s0, 40(sp)                 // s0 = input_data
    /* s2 = number of padding pairs = output_cols - ceil(image_width/2)/... */
    srl     s2, a0, 2
    andi    t9, a0, 2
    srl     t7, t9, 1
    addu    s2, t7, s2
    sll     t0, a3, 3                  // t0 = width_in_blocks*DCT
    srl     t7, t0, 1
    subu    s2, t7, s2
    /* NOTE(review): s2 is decremented to 0 in the padding loop below and is
     * not recomputed per row — confirm padding of rows after the first. */
0:  /* Per row. */
    andi    t6, a0, 1                  // t6 = temp_index
    addiu   t6, -1                     //   (-1 if even width, 0 if odd)
    lw      t4, 0(s1)                  // t4 = outptr
    lw      t5, 0(s0)                  // t5 = inptr0
    li      s3, 0                      // s3 = bias
    srl     t7, a0, 1                  // t7 = image_width1
    srl     s4, t7, 2
    andi    t8, t7, 3
1:  /* 4 output samples per iteration; rounding alternates via shra/shra_r. */
    ulhu    t0, 0(t5)
    ulhu    t1, 2(t5)
    ulhu    t2, 4(t5)
    ulhu    t3, 6(t5)
    raddu.w.qb t0, t0                  // sum of the 2 bytes in each pair
    raddu.w.qb t1, t1
    raddu.w.qb t2, t2
    raddu.w.qb t3, t3
    shra.ph t0, t0, 1                  // bias 0 (truncate)
    shra_r.ph t1, t1, 1                // bias 1 (round)
    shra.ph t2, t2, 1
    shra_r.ph t3, t3, 1
    sb      t0, 0(t4)
    sb      t1, 1(t4)
    sb      t2, 2(t4)
    sb      t3, 3(t4)
    addiu   s4, -1
    addiu   t4, 4
    bgtz    s4, 1b
     addiu  t5, 8
    beqz    t8, 3f
     addu   s4, t4, t8                 // (delay slot) s4 = residual end address
2:  /* Scalar residual pairs with alternating bias in s3. */
    ulhu    t0, 0(t5)
    raddu.w.qb t0, t0
    addqh.w t0, t0, s3                 // (sum + bias) >> 1
    xori    s3, s3, 1
    sb      t0, 0(t4)
    addiu   t4, 1
    bne     t4, s4, 2b
     addiu  t5, 2
3:  /* Replicate the last column (doubled) into the padding area. */
    lbux    t1, t6(t5)                 // last valid sample (or pair partner)
    sll     t1, 1
    addqh.w t2, t1, s3                 // t2 = pixval1
    xori    s3, s3, 1
    addqh.w t3, t1, s3                 // t3 = pixval2
    blez    s2, 5f
     append t3, t2, 8                  // (delay slot) pack pixval1|pixval2
    addu    t5, t4, s2                 // t5 = loop_end2
4:
    ush     t3, 0(t4)
    addiu   s2, -1
    bgtz    s2, 4b
     addiu  t4, 2
5:
    beqz    t9, 6f
     nop
    sb      t2, 0(t4)                  // odd padding byte
6:
    addiu   s1, 4
    addiu   a2, -1
    bnez    a2, 0b
     addiu  s0, 4
7:
    RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4

    j       ra
     nop
END(jsimd_h2v1_downsample_dspr2)
  1073. /*****************************************************************************/
LEAF_DSPR2(jsimd_h2v2_downsample_dspr2)
/*
 * 2:1 horizontal + 2:1 vertical downsampling: averages each 2x2 block of
 * input samples (with an alternating 1/2 bias in s6), then pads the output
 * row out to width_in_blocks*DCTSIZE by replicating the last sample.
 *
 * a0     = cinfo->image_width
 * a1     = cinfo->max_v_samp_factor
 * a2     = compptr->v_samp_factor
 * a3     = compptr->width_in_blocks
 * 16(sp) = input_data
 * 20(sp) = output_data
 */
    .set at

    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

    beqz    a2, 8f
     lw     s1, 52(sp)                 // (delay slot) s1 = output_data
    lw      s0, 48(sp)                 // s0 = input_data
    andi    t6, a0, 1                  // t6 = temp_index
    addiu   t6, -1
    srl     t7, a0, 1                  // t7 = image_width1
    srl     s4, t7, 2                  // s4 = 4-sample group count
    andi    t8, t7, 3                  // t8 = residual count
    andi    t9, a0, 2
    srl     s2, a0, 2
    srl     t7, t9, 1
    addu    s2, t7, s2
    sll     t0, a3, 3                  // t0 = width_in_blocks*DCT
    srl     t7, t0, 1
    subu    s2, t7, s2                 // s2 = padding pair count
    /* NOTE(review): s2 and s4 are computed once, outside the row loop, yet
     * are consumed (decremented) inside it — confirm behavior for
     * v_samp_factor > 1. */
0:  /* Per output row (consumes two input rows). */
    lw      t4, 0(s1)                  // t4 = outptr
    lw      t5, 0(s0)                  // t5 = inptr0
    lw      s7, 4(s0)                  // s7 = inptr1
    li      s6, 1                      // s6 = bias (alternates 1 <-> 2)
2:  /* 4 output samples per iteration (8 input columns, 2 rows). */
    ulw     t0, 0(t5)                  // t0 = |P3|P2|P1|P0|
    ulw     t1, 0(s7)                  // t1 = |Q3|Q2|Q1|Q0|
    ulw     t2, 4(t5)
    ulw     t3, 4(s7)
    precrq.ph.w t7, t0, t1             // t7 = |P3|P2|Q3|Q2|
    ins     t0, t1, 16, 16             // t0 = |Q1|Q0|P1|P0|
    raddu.w.qb t1, t7                  // sum of one 2x2 block
    raddu.w.qb t0, t0
    shra_r.w t1, t1, 2                 // (sum + 2) >> 2
    addiu   t0, 1
    srl     t0, 2                      // (sum + 1) >> 2
    precrq.ph.w t7, t2, t3
    ins     t2, t3, 16, 16
    raddu.w.qb t7, t7
    raddu.w.qb t2, t2
    shra_r.w t7, t7, 2
    addiu   t2, 1
    srl     t2, 2
    sb      t0, 0(t4)
    sb      t1, 1(t4)
    sb      t2, 2(t4)
    sb      t7, 3(t4)
    addiu   t4, 4
    addiu   t5, 8
    addiu   s4, s4, -1
    bgtz    s4, 2b
     addiu  s7, 8
    beqz    t8, 4f
     addu   t8, t4, t8                 // (delay slot) t8 = residual end address
3:  /* Scalar residual 2x2 blocks with alternating bias in s6. */
    ulhu    t0, 0(t5)
    ulhu    t1, 0(s7)
    ins     t0, t1, 16, 16
    raddu.w.qb t0, t0
    addu    t0, t0, s6
    srl     t0, 2
    xori    s6, s6, 3                  // toggle bias 1 <-> 2
    sb      t0, 0(t4)
    addiu   t5, 2
    addiu   t4, 1
    bne     t8, t4, 3b
     addiu  s7, 2
4:  /* Replicate last column (doubled, both rows) into the padding area. */
    lbux    t1, t6(t5)
    sll     t1, 1
    lbux    t0, t6(s7)
    sll     t0, 1
    addu    t1, t1, t0
    addu    t3, t1, s6
    srl     t0, t3, 2                  // t0 = pixval1
    xori    s6, s6, 3
    addu    t2, t1, s6
    srl     t1, t2, 2                  // t1 = pixval2
    blez    s2, 6f
     append t1, t0, 8                  // (delay slot) pack pixval1|pixval2
5:
    ush     t1, 0(t4)
    addiu   s2, -1
    bgtz    s2, 5b
     addiu  t4, 2
6:
    beqz    t9, 7f
     nop
    sb      t0, 0(t4)                  // odd padding byte
7:
    addiu   s1, 4
    addiu   a2, -1
    bnez    a2, 0b
     addiu  s0, 8                      // (delay slot) advance two input rows
8:
    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

    j       ra
     nop
END(jsimd_h2v2_downsample_dspr2)
  1180. /*****************************************************************************/
LEAF_DSPR2(jsimd_h2v2_smooth_downsample_dspr2)
/*
 * Smoothed 2x2 downsampling: each output sample is a weighted blend of a
 * 2x2 block (weight t6 = 16384 - smoothing*80) and its surrounding ring
 * (weight t7 = smoothing*16), accumulated in $ac1 and extracted with
 * rounding.  Columns -1 and width are handled by edge replication; the
 * middle of each row runs a 4x-unrolled kernel.
 *
 * a0     = input_data
 * a1     = output_data
 * a2     = compptr->v_samp_factor
 * a3     = cinfo->max_v_samp_factor
 * 16(sp) = cinfo->smoothing_factor
 * 20(sp) = compptr->width_in_blocks
 * 24(sp) = cinfo->image_width
 */
    .set at

    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

    lw      s7, 52(sp)                 // compptr->width_in_blocks
    lw      s0, 56(sp)                 // cinfo->image_width
    lw      s6, 48(sp)                 // cinfo->smoothing_factor
    sll     s7, 3                      // output_cols = width_in_blocks * DCTSIZE
    sll     v0, s7, 1
    subu    v0, v0, s0                 // v0 = padding columns needed per input row
    blez    v0, 2f
     move   v1, zero                   // (delay slot) v1 = row counter
    addiu   t0, a3, 2                  // t0 = cinfo->max_v_samp_factor + 2
0:  /* Pad each input row (incl. rows -1 and max_v+1) by replicating the
     * last valid sample v0 times. */
    addiu   t1, a0, -4
    sll     t2, v1, 2
    lwx     t1, t2(t1)
    move    t3, v0
    addu    t1, t1, s0
    lbu     t2, -1(t1)                 // last valid sample
1:
    addiu   t3, t3, -1
    sb      t2, 0(t1)
    bgtz    t3, 1b
     addiu  t1, t1, 1
    addiu   v1, v1, 1
    bne     v1, t0, 0b
     nop
2:  /* Precompute blend weights. */
    li      v0, 80
    mul     v0, s6, v0
    li      v1, 16384
    move    t4, zero                   // t4 = outrow
    move    t5, zero                   // t5 = inrow index helper
    subu    t6, v1, v0                 // t6 = 16384 - tmp_smoot_f * 80
    sll     t7, s6, 4                  // t7 = tmp_smoot_f * 16
3:
    /* Special case for first column: pretend column -1 is same as column 0 */
    sll     v0, t4, 2
    lwx     t8, v0(a1)                 // outptr = output_data[outrow]
    sll     v1, t5, 2
    addiu   t9, v1, 4
    addiu   s0, v1, -4
    addiu   s1, v1, 8
    lwx     s2, v1(a0)                 // inptr0 = input_data[inrow]
    lwx     t9, t9(a0)                 // inptr1 = input_data[inrow+1]
    lwx     s0, s0(a0)                 // above_ptr = input_data[inrow-1]
    lwx     s1, s1(a0)                 // below_ptr = input_data[inrow+2]
    lh      v0, 0(s2)
    lh      v1, 0(t9)
    lh      t0, 0(s0)
    lh      t1, 0(s1)
    ins     v0, v1, 16, 16
    ins     t0, t1, 16, 16
    raddu.w.qb t2, v0                  // t2 = 2x2 block sum
    raddu.w.qb s3, t0                  // s3 = above+below pair sum
    lbu     v0, 0(s2)                  // left neighbor == column 0 here
    lbu     v1, 2(s2)
    lbu     t0, 0(t9)
    lbu     t1, 2(t9)
    addu    v0, v0, v1
    mult    $ac1, t2, t6               // ac1 = blocksum * main weight
    addu    t0, t0, t1
    lbu     t2, 2(s0)
    addu    t0, t0, v0
    lbu     t3, 2(s1)
    addu    s3, t0, s3
    lbu     v0, 0(s0)
    lbu     t0, 0(s1)
    sll     s3, s3, 1
    addu    v0, v0, t2
    addu    t0, t0, t3
    addu    t0, t0, v0
    addu    s3, t0, s3                 // s3 = ring sum
    madd    $ac1, s3, t7               // + ringsum * edge weight
    extr_r.w v0, $ac1, 16              // rounded >> 16
    addiu   t8, t8, 1
    addiu   s2, s2, 2
    addiu   t9, t9, 2
    addiu   s0, s0, 2
    addiu   s1, s1, 2
    sb      v0, -1(t8)
    /* Peel (output_cols-2) mod 4 columns before the unrolled loop. */
    addiu   s4, s7, -2
    and     s4, s4, 3
    addu    s5, s4, t8                 // end address of peel loop
4:  /* One interior output column per iteration. */
    lh      v0, 0(s2)
    lh      v1, 0(t9)
    lh      t0, 0(s0)
    lh      t1, 0(s1)
    ins     v0, v1, 16, 16
    ins     t0, t1, 16, 16
    raddu.w.qb t2, v0
    raddu.w.qb s3, t0
    lbu     v0, -1(s2)
    lbu     v1, 2(s2)
    lbu     t0, -1(t9)
    lbu     t1, 2(t9)
    addu    v0, v0, v1
    mult    $ac1, t2, t6
    addu    t0, t0, t1
    lbu     t2, 2(s0)
    addu    t0, t0, v0
    lbu     t3, 2(s1)
    addu    s3, t0, s3
    lbu     v0, -1(s0)
    lbu     t0, -1(s1)
    sll     s3, s3, 1
    addu    v0, v0, t2
    addu    t0, t0, t3
    addu    t0, t0, v0
    addu    s3, t0, s3
    madd    $ac1, s3, t7
    extr_r.w t2, $ac1, 16
    addiu   t8, t8, 1
    addiu   s2, s2, 2
    addiu   t9, t9, 2
    addiu   s0, s0, 2
    sb      t2, -1(t8)
    bne     s5, t8, 4b
     addiu  s1, s1, 2
    /* Remaining interior columns, 4 per iteration (software-pipelined:
     * loads for the next column overlap the madd of the current one). */
    addiu   s5, s7, -2
    subu    s5, s5, s4
    addu    s5, s5, t8                 // end address of unrolled loop
5:
    lh      v0, 0(s2)
    lh      v1, 0(t9)
    lh      t0, 0(s0)
    lh      t1, 0(s1)
    ins     v0, v1, 16, 16
    ins     t0, t1, 16, 16
    raddu.w.qb t2, v0
    raddu.w.qb s3, t0
    lbu     v0, -1(s2)
    lbu     v1, 2(s2)
    lbu     t0, -1(t9)
    lbu     t1, 2(t9)
    addu    v0, v0, v1
    mult    $ac1, t2, t6
    addu    t0, t0, t1
    lbu     t2, 2(s0)
    addu    t0, t0, v0
    lbu     t3, 2(s1)
    addu    s3, t0, s3
    lbu     v0, -1(s0)
    lbu     t0, -1(s1)
    sll     s3, s3, 1
    addu    v0, v0, t2
    addu    t0, t0, t3
    lh      v1, 2(t9)                  // prefetch column +1
    addu    t0, t0, v0
    lh      v0, 2(s2)
    addu    s3, t0, s3
    lh      t0, 2(s0)
    lh      t1, 2(s1)
    madd    $ac1, s3, t7
    extr_r.w t2, $ac1, 16
    ins     t0, t1, 16, 16
    ins     v0, v1, 16, 16
    raddu.w.qb s3, t0
    lbu     v1, 4(s2)
    lbu     t0, 1(t9)
    lbu     t1, 4(t9)
    sb      t2, 0(t8)                  // store column +0
    raddu.w.qb t3, v0
    lbu     v0, 1(s2)
    addu    t0, t0, t1
    mult    $ac1, t3, t6
    addu    v0, v0, v1
    lbu     t2, 4(s0)
    addu    t0, t0, v0
    lbu     v0, 1(s0)
    addu    s3, t0, s3
    lbu     t0, 1(s1)
    lbu     t3, 4(s1)
    addu    v0, v0, t2
    sll     s3, s3, 1
    addu    t0, t0, t3
    lh      v1, 4(t9)                  // prefetch column +2
    addu    t0, t0, v0
    lh      v0, 4(s2)
    addu    s3, t0, s3
    lh      t0, 4(s0)
    lh      t1, 4(s1)
    madd    $ac1, s3, t7
    extr_r.w t2, $ac1, 16
    ins     t0, t1, 16, 16
    ins     v0, v1, 16, 16
    raddu.w.qb s3, t0
    lbu     v1, 6(s2)
    lbu     t0, 3(t9)
    lbu     t1, 6(t9)
    sb      t2, 1(t8)                  // store column +1
    raddu.w.qb t3, v0
    lbu     v0, 3(s2)
    addu    t0, t0, t1
    mult    $ac1, t3, t6
    addu    v0, v0, v1
    lbu     t2, 6(s0)
    addu    t0, t0, v0
    lbu     v0, 3(s0)
    addu    s3, t0, s3
    lbu     t0, 3(s1)
    lbu     t3, 6(s1)
    addu    v0, v0, t2
    sll     s3, s3, 1
    addu    t0, t0, t3
    lh      v1, 6(t9)                  // prefetch column +3
    addu    t0, t0, v0
    lh      v0, 6(s2)
    addu    s3, t0, s3
    lh      t0, 6(s0)
    lh      t1, 6(s1)
    madd    $ac1, s3, t7
    extr_r.w t3, $ac1, 16
    ins     t0, t1, 16, 16
    ins     v0, v1, 16, 16
    raddu.w.qb s3, t0
    lbu     v1, 8(s2)
    lbu     t0, 5(t9)
    lbu     t1, 8(t9)
    sb      t3, 2(t8)                  // store column +2
    raddu.w.qb t2, v0
    lbu     v0, 5(s2)
    addu    t0, t0, t1
    mult    $ac1, t2, t6
    addu    v0, v0, v1
    lbu     t2, 8(s0)
    addu    t0, t0, v0
    lbu     v0, 5(s0)
    addu    s3, t0, s3
    lbu     t0, 5(s1)
    lbu     t3, 8(s1)
    addu    v0, v0, t2
    sll     s3, s3, 1
    addu    t0, t0, t3
    addiu   t8, t8, 4
    addu    t0, t0, v0
    addiu   s2, s2, 8
    addu    s3, t0, s3
    addiu   t9, t9, 8
    madd    $ac1, s3, t7
    extr_r.w t1, $ac1, 16
    addiu   s0, s0, 8
    addiu   s1, s1, 8
    bne     s5, t8, 5b
     sb     t1, -1(t8)                 // (delay slot) store column +3
    /* Special case for last column */
    lh      v0, 0(s2)
    lh      v1, 0(t9)
    lh      t0, 0(s0)
    lh      t1, 0(s1)
    ins     v0, v1, 16, 16
    ins     t0, t1, 16, 16
    raddu.w.qb t2, v0
    raddu.w.qb s3, t0
    lbu     v0, -1(s2)
    lbu     v1, 1(s2)                  // right neighbor == last column itself
    lbu     t0, -1(t9)
    lbu     t1, 1(t9)
    addu    v0, v0, v1
    mult    $ac1, t2, t6
    addu    t0, t0, t1
    lbu     t2, 1(s0)
    addu    t0, t0, v0
    lbu     t3, 1(s1)
    addu    s3, t0, s3
    lbu     v0, -1(s0)
    lbu     t0, -1(s1)
    sll     s3, s3, 1
    addu    v0, v0, t2
    addu    t0, t0, t3
    addu    t0, t0, v0
    addu    s3, t0, s3
    madd    $ac1, s3, t7
    extr_r.w t0, $ac1, 16
    addiu   t5, t5, 2
    sb      t0, 0(t8)
    addiu   t4, t4, 1
    bne     t4, a2, 3b
     addiu  t5, t5, 2                  // (delay slot) inrow advances by 2 rows total
    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

    j       ra
     nop
END(jsimd_h2v2_smooth_downsample_dspr2)
  1474. /*****************************************************************************/
LEAF_DSPR2(jsimd_int_upsample_dspr2)
/*
 * Integer (pixel-replication) upsampling: each input sample is repeated
 * h_expand times horizontally; each produced row is then duplicated
 * v_expand-1 more times by a 16-byte block copy.
 *
 * a0     = upsample->h_expand[compptr->component_index]
 * a1     = upsample->v_expand[compptr->component_index]
 * a2     = input_data
 * a3     = output_data_ptr
 * 16(sp) = cinfo->output_width
 * 20(sp) = cinfo->max_v_samp_factor
 */
    .set at

    SAVE_REGS_ON_STACK 16, s0, s1, s2, s3

    lw      s0, 0(a3)                  // s0 = output_data
    lw      s1, 32(sp)                 // s1 = cinfo->output_width
    lw      s2, 36(sp)                 // s2 = cinfo->max_v_samp_factor
    li      t6, 0                      // t6 = inrow
    beqz    s2, 10f
     li     s3, 0                      // (delay slot) s3 = outrow
0:  /* Per input row. */
    /* NOTE(review): t6/s3 are row indices but are added to the row-pointer
     * arrays without the *4 scaling used elsewhere in this file (cf. lwx
     * with sll ..., 2) — confirm correctness when inrow/outrow > 0. */
    addu    t0, a2, t6
    addu    t7, s0, s3
    lw      t3, 0(t0)                  // t3 = inptr
    lw      t8, 0(t7)                  // t8 = outptr
    beqz    s1, 4f
     addu   t5, t8, s1                 // (delay slot) t5 = outend
1:  /* Replicate each input sample h_expand times. */
    lb      t2, 0(t3)                  // t2 = invalue = *inptr++
    addiu   t3, 1
    beqz    a0, 3f
     move   t0, a0                     // (delay slot) t0 = h_expand
2:
    sb      t2, 0(t8)
    addiu   t0, -1
    bgtz    t0, 2b
     addiu  t8, 1
3:
    bgt     t5, t8, 1b
     nop
4:  /* Duplicate the generated row v_expand-1 times. */
    addiu   t9, a1, -1                 // t9 = v_expand - 1
    blez    t9, 9f
     nop
5:
    lw      t3, 0(s0)                  // source row
    lw      t4, 4(s0)                  // destination row
    subu    t0, s1, 0xF
    blez    t0, 7f
     addu   t5, t3, s1                 // (delay slot) t5 = end address
    andi    t7, s1, 0xF                // t7 = residual
    subu    t8, t5, t7
6:  /* 16-byte unaligned block copy. */
    ulw     t0, 0(t3)
    ulw     t1, 4(t3)
    ulw     t2, 8(t3)
    usw     t0, 0(t4)
    ulw     t0, 12(t3)
    usw     t1, 4(t4)
    usw     t2, 8(t4)
    usw     t0, 12(t4)
    addiu   t3, 16
    bne     t3, t8, 6b
     addiu  t4, 16
    beqz    t7, 8f
     nop
7:  /* Byte-wise residual copy. */
    lbu     t0, 0(t3)
    sb      t0, 0(t4)
    addiu   t3, 1
    bne     t3, t5, 7b
     addiu  t4, 1
8:
    addiu   t9, -1
    bgtz    t9, 5b
     addiu  s0, 8                      // (delay slot) next src/dst row pair
9:
    addu    s3, s3, a1                 // outrow += v_expand
    bne     s3, s2, 0b
     addiu  t6, 1                      // (delay slot) inrow++
10:
    RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3

    j       ra
     nop
END(jsimd_int_upsample_dspr2)
  1557. /*****************************************************************************/
LEAF_DSPR2(jsimd_h2v1_upsample_dspr2)
/*
 * Simple 2:1 horizontal upsampling (pixel doubling), 8 input samples ->
 * 16 output samples per inner iteration via byte-lane ins tricks.
 *
 * a0 = cinfo->max_v_samp_factor
 * a1 = cinfo->output_width
 * a2 = input_data
 * a3 = output_data_ptr
 */
    lw      t7, 0(a3)                  // t7 = output_data
    andi    t8, a1, 0xf                // t8 = residual
    sll     t0, a0, 2
    blez    a0, 4f
     addu   t9, t7, t0                 // (delay slot) t9 = output_data end address
0:  /* Per row. */
    lw      t5, 0(t7)                  // t5 = outptr
    lw      t6, 0(a2)                  // t6 = inptr
    addu    t3, t5, a1                 // t3 = outptr + output_width (end address)
    subu    t3, t8                     // t3 = end address - residual
    beq     t5, t3, 2f
     move   t4, t8                     // (delay slot) t4 = residual counter
1:  /* 8 input bytes -> 16 doubled output bytes. */
    ulw     t0, 0(t6)                  // t0 = |P3|P2|P1|P0|
    ulw     t2, 4(t6)                  // t2 = |P7|P6|P5|P4|
    srl     t1, t0, 16                 // t1 = |X|X|P3|P2|
    ins     t0, t0, 16, 16             // t0 = |P1|P0|P1|P0|
    ins     t1, t1, 16, 16             // t1 = |P3|P2|P3|P2|
    ins     t0, t0, 8, 16              // t0 = |P1|P1|P0|P0|
    ins     t1, t1, 8, 16              // t1 = |P3|P3|P2|P2|
    usw     t0, 0(t5)
    usw     t1, 4(t5)
    srl     t0, t2, 16                 // t0 = |X|X|P7|P6|
    ins     t2, t2, 16, 16             // t2 = |P5|P4|P5|P4|
    ins     t0, t0, 16, 16             // t0 = |P7|P6|P7|P6|
    ins     t2, t2, 8, 16              // t2 = |P5|P5|P4|P4|
    ins     t0, t0, 8, 16              // t0 = |P7|P7|P6|P6|
    usw     t2, 8(t5)
    usw     t0, 12(t5)
    addiu   t5, 16
    bne     t5, t3, 1b
     addiu  t6, 8
    beqz    t8, 3f
     move   t4, t8
2:  /* Scalar residual: double one sample per iteration. */
    lbu     t1, 0(t6)
    sb      t1, 0(t5)
    sb      t1, 1(t5)
    addiu   t4, -2
    addiu   t6, 1
    bgtz    t4, 2b
     addiu  t5, 2
3:
    addiu   t7, 4
    bne     t9, t7, 0b
     addiu  a2, 4                      // (delay slot) next input row
4:
    j       ra
     nop
END(jsimd_h2v1_upsample_dspr2)
  1615. /*****************************************************************************/
LEAF_DSPR2(jsimd_h2v2_upsample_dspr2)
/*
 * h2v2 (2x horizontal, 2x vertical) upsampling: each input pixel is
 * duplicated horizontally, and each expanded row is then copied once
 * vertically, so output rows are produced in pairs from one input row.
 *
 * a0 = cinfo->max_v_samp_factor   (number of output rows; decremented by 2)
 * a1 = cinfo->output_width
 * a2 = input_data                 (array of input row pointers)
 * a3 = output_data_ptr
 *
 * NOTE: the instruction after every branch is its delay slot and always
 * executes.
 */
        lw        t7, 0(a3)        // t7 = output_data
        blez      a0, 7f
        andi      t9, a1, 0xf      // t9 = residual  (delay slot)
0:                                 // ---- loop: one input row -> two output rows ----
        lw        t6, 0(a2)        // t6 = inptr
        lw        t5, 0(t7)        // t5 = outptr
        addu      t8, t5, a1       // t8 = outptr end address
        subu      t8, t9           // t8 = end address - residual
        beq       t5, t8, 2f       // row shorter than 16 output bytes: residual only
        move      t4, t9           // (delay slot)
1:                                 // ---- horizontal expansion: 8 in -> 16 out bytes ----
        ulw       t0, 0(t6)
        srl       t1, t0, 16
        ins       t0, t0, 16, 16
        ins       t0, t0, 8, 16    // duplicate pixels 0/1
        ins       t1, t1, 16, 16
        ins       t1, t1, 8, 16    // duplicate pixels 2/3
        ulw       t2, 4(t6)
        usw       t0, 0(t5)
        usw       t1, 4(t5)
        srl       t3, t2, 16
        ins       t2, t2, 16, 16
        ins       t2, t2, 8, 16    // duplicate pixels 4/5
        ins       t3, t3, 16, 16
        ins       t3, t3, 8, 16    // duplicate pixels 6/7
        usw       t2, 8(t5)
        usw       t3, 12(t5)
        addiu     t5, 16
        bne       t5, t8, 1b
        addiu     t6, 8            // (delay slot)
        beqz      t9, 3f           // no residual pixels
        move      t4, t9           // (delay slot)
2:                                 // ---- residual: 1 input byte -> 2 output bytes ----
        lbu       t0, 0(t6)
        sb        t0, 0(t5)
        sb        t0, 1(t5)
        addiu     t4, -2
        addiu     t6, 1
        bgtz      t4, 2b
        addiu     t5, 2            // (delay slot)
3:                                 // ---- vertical duplication: outptr[1] = outptr[0] ----
        lw        t6, 0(t7)        // t6 = outptr[0]
        lw        t5, 4(t7)        // t5 = outptr[1]
        addu      t4, t6, a1       // t4 = new end address
        beq       a1, t9, 5f       // whole row is residual-sized: copy bytewise
        subu      t8, t4, t9       // (delay slot)
4:                                 // word copy, 16 bytes per iteration
        ulw       t0, 0(t6)
        ulw       t1, 4(t6)
        ulw       t2, 8(t6)
        usw       t0, 0(t5)
        ulw       t0, 12(t6)
        usw       t1, 4(t5)
        usw       t2, 8(t5)
        usw       t0, 12(t5)
        addiu     t6, 16
        bne       t6, t8, 4b
        addiu     t5, 16           // (delay slot)
        beqz      t9, 6f
        nop
5:                                 // byte copy of the residual tail
        lbu       t0, 0(t6)
        sb        t0, 0(t5)
        addiu     t6, 1
        bne       t6, t4, 5b
        addiu     t5, 1            // (delay slot)
6:
        addiu     t7, 8            // advance two output row pointers
        addiu     a0, -2           // two output rows emitted
        bgtz      a0, 0b
        addiu     a2, 4            // next input row pointer  (delay slot)
7:
        j         ra
        nop
END(jsimd_h2v2_upsample_dspr2)
  1698. /*****************************************************************************/
LEAF_DSPR2(jsimd_idct_islow_dspr2)
/*
 * Accurate ("slow") integer inverse DCT on one 8x8 coefficient block.
 *
 * a0 = coef_block           (8x8 int16 DCT coefficients)
 * a1 = compptr->dcttable    (8x8 int16 dequantization multipliers)
 * a2 = output               (array of output row pointers)
 * a3 = range_limit          (byte lookup table used to clamp samples)
 *
 * Pass 1 (label 1:) transforms columns into a 256-byte (8x8 int32)
 * workspace allocated on the stack; pass 2 (label 4:) transforms the
 * workspace rows and writes clamped bytes via the range_limit table.
 * Both passes take a shortcut when all seven AC terms of a column/row
 * are zero.  The FIX_n_nnn constants are 13-bit (CONST_BITS) fixed-point
 * scale factors; pass 1 descales with a rounding shift of 11, pass 2
 * with a rounding shift of 18.
 *
 * NOTE: the instruction after every branch is its delay slot.
 */
SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
        addiu     sp, sp, -256     // allocate 8x8 int32 workspace
        move      v0, sp           // v0 = wsptr
        addiu     v1, zero, 8      // v1 = DCTSIZE = 8 (column counter)
1:                                 // ======== pass 1: one column per iteration ========
        lh        s4, 32(a0)       // s4 = inptr[16]
        lh        s5, 64(a0)       // s5 = inptr[32]
        lh        s6, 96(a0)       // s6 = inptr[48]
        lh        t1, 112(a0)      // t1 = inptr[56]
        lh        t7, 16(a0)       // t7 = inptr[8]
        lh        t5, 80(a0)       // t5 = inptr[40]
        lh        t3, 48(a0)       // t3 = inptr[24]
        or        s4, s4, t1       // OR all seven AC terms together...
        or        s4, s4, t3
        or        s4, s4, t5
        or        s4, s4, t7
        or        s4, s4, s5
        or        s4, s4, s6
        bnez      s4, 2f           // ...any nonzero -> full column transform
        addiu     v1, v1, -1       // column counter-- (delay slot; runs on both paths)
        // ---- AC-free shortcut: column output is the dequantized DC everywhere ----
        lh        s5, 0(a1)        // quantptr[DCTSIZE*0]
        lh        s6, 0(a0)        // inptr[DCTSIZE*0]
        mul       s5, s5, s6       // DEQUANTIZE(inptr[0], quantptr[0])
        sll       s5, s5, 2        // << PASS1_BITS
        sw        s5, 0(v0)
        sw        s5, 32(v0)
        sw        s5, 64(v0)
        sw        s5, 96(v0)
        sw        s5, 128(v0)
        sw        s5, 160(v0)
        sw        s5, 192(v0)
        b         3f
        sw        s5, 224(v0)      // last store rides the delay slot
2:                                 // ---- full column transform (odd part first) ----
        lh        t0, 112(a1)
        lh        t2, 48(a1)
        lh        t4, 80(a1)
        lh        t6, 16(a1)
        mul       t0, t0, t1       // DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7])
        mul       t1, t2, t3       // DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3])
        mul       t2, t4, t5       // DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5])
        mul       t3, t6, t7       // DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1])
        lh        t4, 32(a1)
        lh        t5, 32(a0)
        lh        t6, 96(a1)
        lh        t7, 96(a0)
        addu      s0, t0, t1       // z3 = tmp0 + tmp2
        addu      s1, t1, t2       // z2 = tmp1 + tmp2
        addu      s2, t2, t3       // z4 = tmp1 + tmp3
        addu      s3, s0, s2       // z3 + z4
        addiu     t9, zero, 9633   // FIX_1_175875602
        mul       s3, s3, t9       // z5 = MULTIPLY(z3 + z4, FIX_1_175875602)
        addu      t8, t0, t3       // z1 = tmp0 + tmp3
        addiu     t9, zero, 2446   // FIX_0_298631336
        mul       t0, t0, t9       // tmp0 = MULTIPLY(tmp0, FIX_0_298631336)
        addiu     t9, zero, 16819  // FIX_2_053119869
        mul       t2, t2, t9       // tmp1 = MULTIPLY(tmp1, FIX_2_053119869)
        addiu     t9, zero, 25172  // FIX_3_072711026
        mul       t1, t1, t9       // tmp2 = MULTIPLY(tmp2, FIX_3_072711026)
        addiu     t9, zero, 12299  // FIX_1_501321110
        mul       t3, t3, t9       // tmp3 = MULTIPLY(tmp3, FIX_1_501321110)
        addiu     t9, zero, 16069  // FIX_1_961570560
        mul       s0, s0, t9       // -z3 = MULTIPLY(z3, FIX_1_961570560)
        addiu     t9, zero, 3196   // FIX_0_390180644
        mul       s2, s2, t9       // -z4 = MULTIPLY(z4, FIX_0_390180644)
        addiu     t9, zero, 7373   // FIX_0_899976223
        mul       t8, t8, t9       // -z1 = MULTIPLY(z1, FIX_0_899976223)
        addiu     t9, zero, 20995  // FIX_2_562915447
        mul       s1, s1, t9       // -z2 = MULTIPLY(z2, FIX_2_562915447)
        // negated products above are folded in with subu below
        subu      s0, s3, s0       // z3 += z5
        addu      t0, t0, s0       // tmp0 += z3
        addu      t1, t1, s0       // tmp2 += z3
        subu      s2, s3, s2       // z4 += z5
        addu      t2, t2, s2       // tmp1 += z4
        addu      t3, t3, s2       // tmp3 += z4
        subu      t0, t0, t8       // tmp0 += z1
        subu      t1, t1, s1       // tmp2 += z2
        subu      t2, t2, s1       // tmp1 += z2
        subu      t3, t3, t8       // tmp3 += z1
        // ---- even part ----
        mul       s0, t4, t5       // DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2])
        addiu     t9, zero, 6270   // FIX_0_765366865
        mul       s1, t6, t7       // DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6])
        lh        t4, 0(a1)
        lh        t5, 0(a0)
        lh        t6, 64(a1)
        lh        t7, 64(a0)
        mul       s2, t9, s0       // MULTIPLY(z2, FIX_0_765366865)
        mul       t5, t4, t5       // DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0])
        mul       t6, t6, t7       // DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4])
        addiu     t9, zero, 4433   // FIX_0_541196100
        addu      s3, s0, s1       // z2 + z3
        mul       s3, s3, t9       // z1 = MULTIPLY(z2 + z3, FIX_0_541196100)
        addiu     t9, zero, 15137  // FIX_1_847759065
        mul       t8, s1, t9       // MULTIPLY(z3, FIX_1_847759065)
        addu      t4, t5, t6
        subu      t5, t5, t6
        sll       t4, t4, 13       // tmp0 = (z2 + z3) << CONST_BITS
        sll       t5, t5, 13       // tmp1 = (z2 - z3) << CONST_BITS
        addu      t7, s3, s2       // tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865)
        subu      t6, s3, t8       // tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065)
        addu      s0, t4, t7       // tmp10
        subu      s1, t4, t7       // tmp13
        addu      s2, t5, t6       // tmp11
        subu      s3, t5, t6       // tmp12
        // ---- butterfly of even/odd parts ----
        addu      t4, s0, t3
        subu      s0, s0, t3
        addu      t3, s2, t1
        subu      s2, s2, t1
        addu      t1, s3, t2
        subu      s3, s3, t2
        addu      t2, s1, t0
        subu      s1, s1, t0
        shra_r.w  t4, t4, 11       // rounding descale by CONST_BITS - PASS1_BITS
        shra_r.w  t3, t3, 11
        shra_r.w  t1, t1, 11
        shra_r.w  t2, t2, 11
        shra_r.w  s1, s1, 11
        shra_r.w  s3, s3, 11
        shra_r.w  s2, s2, 11
        shra_r.w  s0, s0, 11
        sw        t4, 0(v0)        // wsptr[DCTSIZE*0..7] for this column
        sw        t3, 32(v0)
        sw        t1, 64(v0)
        sw        t2, 96(v0)
        sw        s1, 128(v0)
        sw        s3, 160(v0)
        sw        s2, 192(v0)
        sw        s0, 224(v0)
3:
        addiu     a1, a1, 2        // next quantizer column
        addiu     a0, a0, 2        // next coefficient column
        bgtz      v1, 1b
        addiu     v0, v0, 4        // next workspace column  (delay slot)
        move      v0, sp           // reset wsptr for pass 2
        addiu     v1, zero, 8      // row counter
4:                                 // ======== pass 2: one row per iteration ========
        lw        t0, 8(v0)        // z2 = (JLONG)wsptr[2]
        lw        t1, 24(v0)       // z3 = (JLONG)wsptr[6]
        lw        t2, 0(v0)        // (JLONG)wsptr[0]
        lw        t3, 16(v0)       // (JLONG)wsptr[4]
        lw        s4, 4(v0)        // (JLONG)wsptr[1]
        lw        s5, 12(v0)       // (JLONG)wsptr[3]
        lw        s6, 20(v0)       // (JLONG)wsptr[5]
        lw        s7, 28(v0)       // (JLONG)wsptr[7]
        or        s4, s4, t0       // OR the seven AC terms of the row
        or        s4, s4, t1
        or        s4, s4, t3
        or        s4, s4, s7
        or        s4, s4, s5
        or        s4, s4, s6
        bnez      s4, 5f
        addiu     v1, v1, -1       // row counter--  (delay slot; both paths)
        // ---- AC-free shortcut: whole row = clamped DC, replicated 8x ----
        shra_r.w  s5, t2, 5        // descale DC (PASS1_BITS + 3, rounded)
        andi      s5, s5, 0x3ff    // mask to 10 bits for range_limit index
        lbux      s5, s5(a3)       // clamp via range_limit table
        lw        s1, 0(a2)        // s1 = output row pointer
        replv.qb  s5, s5           // replicate the byte across the word
        usw       s5, 0(s1)
        usw       s5, 4(s1)
        b         6f
        nop
5:                                 // ---- full row transform (even part first) ----
        addu      t4, t0, t1       // z2 + z3
        addiu     t8, zero, 4433   // FIX_0_541196100
        mul       t5, t4, t8       // z1 = MULTIPLY(z2 + z3, FIX_0_541196100)
        addiu     t8, zero, 15137  // FIX_1_847759065
        mul       t1, t1, t8       // MULTIPLY(z3, FIX_1_847759065)
        addiu     t8, zero, 6270   // FIX_0_765366865
        mul       t0, t0, t8       // MULTIPLY(z2, FIX_0_765366865)
        addu      t4, t2, t3       // (JLONG)wsptr[0] + (JLONG)wsptr[4]
        subu      t2, t2, t3       // (JLONG)wsptr[0] - (JLONG)wsptr[4]
        sll       t4, t4, 13       // tmp0 = (wsptr[0] + wsptr[4]) << CONST_BITS
        sll       t2, t2, 13       // tmp1 = (wsptr[0] - wsptr[4]) << CONST_BITS
        subu      t1, t5, t1       // tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065)
        subu      t3, t2, t1       // tmp12 = tmp1 - tmp2
        addu      t2, t2, t1       // tmp11 = tmp1 + tmp2
        addu      t5, t5, t0       // tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865)
        subu      t1, t4, t5       // tmp13 = tmp0 - tmp3
        addu      t0, t4, t5       // tmp10 = tmp0 + tmp3
        // ---- odd part ----
        lw        t4, 28(v0)       // tmp0 = (JLONG)wsptr[7]
        lw        t6, 12(v0)       // tmp2 = (JLONG)wsptr[3]
        lw        t5, 20(v0)       // tmp1 = (JLONG)wsptr[5]
        lw        t7, 4(v0)        // tmp3 = (JLONG)wsptr[1]
        addu      s0, t4, t6       // z3 = tmp0 + tmp2
        addiu     t8, zero, 9633   // FIX_1_175875602
        addu      s1, t5, t7       // z4 = tmp1 + tmp3
        addu      s2, s0, s1       // z3 + z4
        mul       s2, s2, t8       // z5 = MULTIPLY(z3 + z4, FIX_1_175875602)
        addu      s3, t4, t7       // z1 = tmp0 + tmp3
        addu      t9, t5, t6       // z2 = tmp1 + tmp2
        addiu     t8, zero, 16069  // FIX_1_961570560
        mul       s0, s0, t8       // -z3 = MULTIPLY(z3, FIX_1_961570560)
        addiu     t8, zero, 3196   // FIX_0_390180644
        mul       s1, s1, t8       // -z4 = MULTIPLY(z4, FIX_0_390180644)
        addiu     t8, zero, 2446   // FIX_0_298631336
        mul       t4, t4, t8       // tmp0 = MULTIPLY(tmp0, FIX_0_298631336)
        addiu     t8, zero, 7373   // FIX_0_899976223
        mul       s3, s3, t8       // -z1 = MULTIPLY(z1, FIX_0_899976223)
        addiu     t8, zero, 16819  // FIX_2_053119869
        mul       t5, t5, t8       // tmp1 = MULTIPLY(tmp1, FIX_2_053119869)
        addiu     t8, zero, 20995  // FIX_2_562915447
        mul       t9, t9, t8       // -z2 = MULTIPLY(z2, FIX_2_562915447)
        addiu     t8, zero, 25172  // FIX_3_072711026
        mul       t6, t6, t8       // tmp2 = MULTIPLY(tmp2, FIX_3_072711026)
        addiu     t8, zero, 12299  // FIX_1_501321110
        mul       t7, t7, t8       // tmp3 = MULTIPLY(tmp3, FIX_1_501321110)
        subu      s0, s2, s0       // z3 += z5
        subu      s1, s2, s1       // z4 += z5
        addu      t4, t4, s0
        subu      t4, t4, s3       // tmp0
        addu      t5, t5, s1
        subu      t5, t5, t9       // tmp1
        addu      t6, t6, s0
        subu      t6, t6, t9       // tmp2
        addu      t7, t7, s1
        subu      t7, t7, s3       // tmp3
        // ---- butterfly and final descale ----
        addu      s0, t0, t7
        subu      t0, t0, t7
        addu      t7, t2, t6
        subu      t2, t2, t6
        addu      t6, t3, t5
        subu      t3, t3, t5
        addu      t5, t1, t4
        subu      t1, t1, t4
        shra_r.w  s0, s0, 18       // rounding descale by 18 for pass 2
        shra_r.w  t7, t7, 18
        shra_r.w  t6, t6, 18
        shra_r.w  t5, t5, 18
        shra_r.w  t1, t1, 18
        shra_r.w  t3, t3, 18
        shra_r.w  t2, t2, 18
        shra_r.w  t0, t0, 18
        andi      s0, s0, 0x3ff    // 10-bit mask -> range_limit index
        andi      t7, t7, 0x3ff
        andi      t6, t6, 0x3ff
        andi      t5, t5, 0x3ff
        andi      t1, t1, 0x3ff
        andi      t3, t3, 0x3ff
        andi      t2, t2, 0x3ff
        andi      t0, t0, 0x3ff
        lw        s1, 0(a2)        // s1 = output row pointer
        lbux      s0, s0(a3)       // clamp each sample via range_limit
        lbux      t7, t7(a3)
        lbux      t6, t6(a3)
        lbux      t5, t5(a3)
        lbux      t1, t1(a3)
        lbux      t3, t3(a3)
        lbux      t2, t2(a3)
        lbux      t0, t0(a3)
        sb        s0, 0(s1)        // outptr[0..7]
        sb        t7, 1(s1)
        sb        t6, 2(s1)
        sb        t5, 3(s1)
        sb        t1, 4(s1)
        sb        t3, 5(s1)
        sb        t2, 6(s1)
        sb        t0, 7(s1)
6:
        addiu     v0, v0, 32       // next workspace row
        bgtz      v1, 4b
        addiu     a2, a2, 4        // next output row pointer  (delay slot)
        addiu     sp, sp, 256      // release workspace
RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
        j         ra
        nop
END(jsimd_idct_islow_dspr2)
  1972. /*****************************************************************************/
LEAF_DSPR2(jsimd_idct_ifast_cols_dspr2)
/*
 * Fast integer inverse DCT, column pass: dequantizes and transforms the
 * 8x8 block into wsptr.  Two adjacent columns are processed per
 * iteration as packed 16-bit pairs: muleq_s.w.phl/.phr multiply the
 * high/low halfwords (Q15), and the results are repacked with ins.
 *
 * a0 = inptr                  (8x8 int16 coefficients, walked 2 columns at a time)
 * a1 = quantptr               (8x8 int16 dequant multipliers)
 * a2 = wsptr                  (workspace, packed halfword pairs)
 * a3 = mips_idct_ifast_coefs  (table of 4 packed FIX constants, see offsets below)
 *
 * NOTE: the instruction after every branch is its delay slot and always
 * executes — several delay slots below do real work on both paths.
 */
SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
        addiu     t9, a0, 16       // end address (4 iterations x 4 bytes)
        or        AT, a3, zero     // AT = coefficient table base
0:                                 // ---- two columns per iteration ----
        lw        s0, 0(a1)        // quantptr[DCTSIZE*0]
        lw        t0, 0(a0)        // inptr[DCTSIZE*0]
        lw        t1, 16(a0)       // inptr[DCTSIZE*1]
        muleq_s.w.phl v0, t0, s0   // tmp0 ...
        lw        t2, 32(a0)       // inptr[DCTSIZE*2]
        lw        t3, 48(a0)       // inptr[DCTSIZE*3]
        lw        t4, 64(a0)       // inptr[DCTSIZE*4]
        lw        t5, 80(a0)       // inptr[DCTSIZE*5]
        muleq_s.w.phr t0, t0, s0   // ... tmp0 ...
        lw        t6, 96(a0)       // inptr[DCTSIZE*6]
        lw        t7, 112(a0)      // inptr[DCTSIZE*7]
        or        s4, t1, t2       // OR AC rows pairwise to test for all-zero
        or        s5, t3, t4
        bnez      s4, 1f
        ins       t0, v0, 16, 16   // ... tmp0  (delay slot; both paths)
        bnez      s5, 1f
        or        s6, t5, t6       // (delay slot)
        or        s6, s6, t7
        bnez      s6, 1f
        sw        t0, 0(a2)        // wsptr[DCTSIZE*0]  (delay slot; general path overwrites it)
        // ---- AC-free shortcut: replicate dequantized DC down both columns ----
        sw        t0, 16(a2)       // wsptr[DCTSIZE*1]
        sw        t0, 32(a2)       // wsptr[DCTSIZE*2]
        sw        t0, 48(a2)       // wsptr[DCTSIZE*3]
        sw        t0, 64(a2)       // wsptr[DCTSIZE*4]
        sw        t0, 80(a2)       // wsptr[DCTSIZE*5]
        sw        t0, 96(a2)       // wsptr[DCTSIZE*6]
        sw        t0, 112(a2)      // wsptr[DCTSIZE*7]
        addiu     a0, a0, 4
        b         2f
        addiu     a1, a1, 4        // (delay slot)
1:                                 // ---- full transform of the column pair ----
        lw        s1, 32(a1)       // quantptr[DCTSIZE*2]
        lw        s2, 64(a1)       // quantptr[DCTSIZE*4]
        muleq_s.w.phl v0, t2, s1   // tmp1 ...
        muleq_s.w.phr t2, t2, s1   // ... tmp1 ...
        lw        s0, 16(a1)       // quantptr[DCTSIZE*1]
        lw        s1, 48(a1)       // quantptr[DCTSIZE*3]
        lw        s3, 96(a1)       // quantptr[DCTSIZE*6]
        muleq_s.w.phl v1, t4, s2   // tmp2 ...
        muleq_s.w.phr t4, t4, s2   // ... tmp2 ...
        lw        s2, 80(a1)       // quantptr[DCTSIZE*5]
        lw        t8, 4(AT)        // FIX(1.414213562)
        ins       t2, v0, 16, 16   // ... tmp1
        muleq_s.w.phl v0, t6, s3   // tmp3 ...
        muleq_s.w.phr t6, t6, s3   // ... tmp3 ...
        ins       t4, v1, 16, 16   // ... tmp2
        addq.ph   s4, t0, t4       // tmp10
        subq.ph   s5, t0, t4       // tmp11
        ins       t6, v0, 16, 16   // ... tmp3
        subq.ph   s6, t2, t6       // tmp12 ...
        addq.ph   s7, t2, t6       // tmp13
        mulq_s.ph s6, s6, t8       // ... tmp12 ...
        addq.ph   t0, s4, s7       // tmp0
        subq.ph   t6, s4, s7       // tmp3
        muleq_s.w.phl v0, t1, s0   // tmp4 ...
        muleq_s.w.phr t1, t1, s0   // ... tmp4 ...
        shll_s.ph s6, s6, 1        // x2 (compensate Q15 multiply of a >=1.0 constant)
        lw        s3, 112(a1)      // quantptr[DCTSIZE*7]
        subq.ph   s6, s6, s7       // ... tmp12
        muleq_s.w.phl v1, t7, s3   // tmp7 ...
        muleq_s.w.phr t7, t7, s3   // ... tmp7 ...
        ins       t1, v0, 16, 16   // ... tmp4
        addq.ph   t2, s5, s6       // tmp1
        subq.ph   t4, s5, s6       // tmp2
        muleq_s.w.phl v0, t5, s2   // tmp6 ...
        muleq_s.w.phr t5, t5, s2   // ... tmp6 ...
        ins       t7, v1, 16, 16   // ... tmp7
        addq.ph   s5, t1, t7       // z11
        subq.ph   s6, t1, t7       // z12
        muleq_s.w.phl v1, t3, s1   // tmp5 ...
        muleq_s.w.phr t3, t3, s1   // ... tmp5 ...
        ins       t5, v0, 16, 16   // ... tmp6
        ins       t3, v1, 16, 16   // ... tmp5
        addq.ph   s7, t5, t3       // z13
        subq.ph   v0, t5, t3       // z10
        addq.ph   t7, s5, s7       // tmp7
        subq.ph   s5, s5, s7       // tmp11 ...
        addq.ph   v1, v0, s6       // z5 ...
        mulq_s.ph s5, s5, t8       // ... tmp11
        lw        t8, 8(AT)        // FIX(1.847759065)
        lw        s4, 0(AT)        // FIX(1.082392200)
        addq.ph   s0, t0, t7
        subq.ph   s1, t0, t7
        mulq_s.ph v1, v1, t8       // ... z5
        shll_s.ph s5, s5, 1        // x2
        lw        t8, 12(AT)       // FIX(-2.613125930)
        sw        s0, 0(a2)        // wsptr[DCTSIZE*0]
        shll_s.ph v0, v0, 1        // x4 (first of two doublings)
        mulq_s.ph v0, v0, t8       // tmp12 ...
        mulq_s.ph s4, s6, s4       // tmp10 ...
        shll_s.ph v1, v1, 1        // x2
        addiu     a0, a0, 4        // advance input/quant pointers early
        addiu     a1, a1, 4
        sw        s1, 112(a2)      // wsptr[DCTSIZE*7]
        shll_s.ph s6, v0, 1        // x4 (second doubling)
        shll_s.ph s4, s4, 1        // x2
        addq.ph   s6, s6, v1       // ... tmp12
        subq.ph   t5, s6, t7       // tmp6
        subq.ph   s4, s4, v1       // ... tmp10
        subq.ph   t3, s5, t5       // tmp5
        addq.ph   s2, t2, t5
        addq.ph   t1, s4, t3       // tmp4
        subq.ph   s3, t2, t5
        sw        s2, 16(a2)       // wsptr[DCTSIZE*1]
        sw        s3, 96(a2)       // wsptr[DCTSIZE*6]
        addq.ph   v0, t4, t3
        subq.ph   v1, t4, t3
        sw        v0, 32(a2)       // wsptr[DCTSIZE*2]
        sw        v1, 80(a2)       // wsptr[DCTSIZE*5]
        addq.ph   v0, t6, t1
        subq.ph   v1, t6, t1
        sw        v0, 64(a2)       // wsptr[DCTSIZE*4]
        sw        v1, 48(a2)       // wsptr[DCTSIZE*3]
2:
        bne       a0, t9, 0b
        addiu     a2, a2, 4        // next workspace column pair  (delay slot)
RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
        j         ra
        nop
END(jsimd_idct_ifast_cols_dspr2)
  2104. /*****************************************************************************/
LEAF_DSPR2(jsimd_idct_ifast_rows_dspr2)
/*
 * Fast integer inverse DCT, row pass: transforms two workspace rows per
 * iteration (packed halfword pairs from the column pass), packs the
 * results to bytes with precrq.qb.ph, adds the 0x80 level bias (s8 =
 * 0x80808080, wrapping addu.qb), and stores 8 output bytes per row.
 *
 * a0 = wsptr                  (column-pass output; 128 bytes total)
 * a1 = output_buf             (array of output row pointers)
 * a2 = output_col             (byte offset into each output row)
 * a3 = mips_idct_ifast_coefs  (saved on the stack; a3/AT are reused as
 *                              scratch, so it is reloaded from 36(sp)
 *                              at the top of each iteration)
 *
 * NOTE: the instruction after every branch is its delay slot and always
 * executes.
 */
SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8, a3
        addiu     t9, a0, 128      // end address
        lui       s8, 0x8080       // s8 = 0x80808080: per-byte +128 bias
        ori       s8, s8, 0x8080
0:                                 // ---- two rows per iteration ----
        lw        AT, 36(sp)       // restore $a3 (mips_idct_ifast_coefs)
        lw        t0, 0(a0)        // wsptr[DCTSIZE*0+0/1]  b a
        lw        s0, 16(a0)       // wsptr[DCTSIZE*1+0/1]  B A
        lw        t2, 4(a0)        // wsptr[DCTSIZE*0+2/3]  d c
        lw        s2, 20(a0)       // wsptr[DCTSIZE*1+2/3]  D C
        lw        t4, 8(a0)        // wsptr[DCTSIZE*0+4/5]  f e
        lw        s4, 24(a0)       // wsptr[DCTSIZE*1+4/5]  F E
        lw        t6, 12(a0)       // wsptr[DCTSIZE*0+6/7]  h g
        lw        s6, 28(a0)       // wsptr[DCTSIZE*1+6/7]  H G
        precrq.ph.w t1, s0, t0     // B b   (gather the two rows' element 1)
        ins       t0, s0, 16, 16   // A a   (and element 0)
        bnez      t1, 1f           // element-1 pair nonzero -> full transform
        or        s0, t2, s2       // (delay slot)
        bnez      s0, 1f
        or        s0, t4, s4       // (delay slot)
        bnez      s0, 1f
        or        s0, t6, s6       // (delay slot)
        bnez      s0, 1f
        shll_s.ph s0, t0, 2        // A a  (delay slot; saturating final scale)
        // ---- AC-free shortcut: both rows are constant ----
        lw        a3, 0(a1)        // row pointer for row "a"
        lw        AT, 4(a1)        // row pointer for row "A"
        precrq.ph.w t0, s0, s0     // A A
        ins       s0, s0, 16, 16   // a a
        addu      a3, a3, a2       // + output_col
        addu      AT, AT, a2
        precrq.qb.ph t0, t0, t0    // A A A A
        precrq.qb.ph s0, s0, s0    // a a a a
        addu.qb   s0, s0, s8       // +128 level shift per byte
        addu.qb   t0, t0, s8
        sw        s0, 0(a3)
        sw        s0, 4(a3)
        sw        t0, 0(AT)
        sw        t0, 4(AT)
        addiu     a0, a0, 32       // two workspace rows consumed
        bne       a0, t9, 0b
        addiu     a1, a1, 8        // two output row pointers  (delay slot)
        b         2f
        nop
1:                                 // ---- full transform of the row pair ----
        precrq.ph.w t3, s2, t2     // D C
        ins       t2, s2, 16, 16   // d c
        precrq.ph.w t5, s4, t4     // F E
        ins       t4, s4, 16, 16   // f e
        precrq.ph.w t7, s6, t6     // H G
        ins       t6, s6, 16, 16   // h g
        lw        t8, 4(AT)        // FIX(1.414213562)
        addq.ph   s4, t0, t4       // tmp10
        subq.ph   s5, t0, t4       // tmp11
        subq.ph   s6, t2, t6       // tmp12 ...
        addq.ph   s7, t2, t6       // tmp13
        mulq_s.ph s6, s6, t8       // ... tmp12 ...
        addq.ph   t0, s4, s7       // tmp0
        subq.ph   t6, s4, s7       // tmp3
        shll_s.ph s6, s6, 1        // x2
        subq.ph   s6, s6, s7       // ... tmp12
        addq.ph   t2, s5, s6       // tmp1
        subq.ph   t4, s5, s6       // tmp2
        addq.ph   s5, t1, t7       // z11
        subq.ph   s6, t1, t7       // z12
        addq.ph   s7, t5, t3       // z13
        subq.ph   v0, t5, t3       // z10
        addq.ph   t7, s5, s7       // tmp7
        subq.ph   s5, s5, s7       // tmp11 ...
        addq.ph   v1, v0, s6       // z5 ...
        mulq_s.ph s5, s5, t8       // ... tmp11
        lw        t8, 8(AT)        // FIX(1.847759065)
        lw        s4, 0(AT)        // FIX(1.082392200)
        addq.ph   s0, t0, t7       // tmp0 + tmp7
        subq.ph   s7, t0, t7       // tmp0 - tmp7
        mulq_s.ph v1, v1, t8       // ... z5
        lw        a3, 0(a1)        // first output row pointer
        lw        t8, 12(AT)       // FIX(-2.613125930)
        shll_s.ph s5, s5, 1        // x2
        addu      a3, a3, a2       // + output_col
        shll_s.ph v0, v0, 1        // x4 (first of two doublings)
        mulq_s.ph v0, v0, t8       // tmp12 ...
        mulq_s.ph s4, s6, s4       // tmp10 ...
        shll_s.ph v1, v1, 1        // x2
        addiu     a0, a0, 32       // advance pointers early
        addiu     a1, a1, 8
        shll_s.ph s6, v0, 1        // x4 (second doubling)
        shll_s.ph s4, s4, 1        // x2
        addq.ph   s6, s6, v1       // ... tmp12
        shll_s.ph s0, s0, 2        // saturating final descale
        subq.ph   t5, s6, t7       // tmp6
        subq.ph   s4, s4, v1       // ... tmp10
        subq.ph   t3, s5, t5       // tmp5
        shll_s.ph s7, s7, 2
        addq.ph   t1, s4, t3       // tmp4
        addq.ph   s1, t2, t5       // tmp1 + tmp6
        subq.ph   s6, t2, t5       // tmp1 - tmp6
        addq.ph   s2, t4, t3       // tmp2 + tmp5
        subq.ph   s5, t4, t3       // tmp2 - tmp5
        addq.ph   s4, t6, t1       // tmp3 + tmp4
        subq.ph   s3, t6, t1       // tmp3 - tmp4
        shll_s.ph s1, s1, 2        // saturating final descale of remaining outputs
        shll_s.ph s2, s2, 2
        shll_s.ph s3, s3, 2
        shll_s.ph s4, s4, 2
        shll_s.ph s5, s5, 2
        shll_s.ph s6, s6, 2
        // ---- repack row-pair halfwords into per-row byte vectors ----
        precrq.ph.w t0, s1, s0     // B A
        ins       s0, s1, 16, 16   // b a
        precrq.ph.w t2, s3, s2     // D C
        ins       s2, s3, 16, 16   // d c
        precrq.ph.w t4, s5, s4     // F E
        ins       s4, s5, 16, 16   // f e
        precrq.ph.w t6, s7, s6     // H G
        ins       s6, s7, 16, 16   // h g
        precrq.qb.ph t0, t2, t0    // D C B A
        precrq.qb.ph s0, s2, s0    // d c b a
        precrq.qb.ph t4, t6, t4    // H G F E
        precrq.qb.ph s4, s6, s4    // h g f e
        addu.qb   s0, s0, s8       // +128 level shift per byte
        addu.qb   s4, s4, s8
        sw        s0, 0(a3)        // outptr[0/1/2/3]  d c b a
        sw        s4, 4(a3)        // outptr[4/5/6/7]  h g f e
        lw        a3, -4(a1)       // second row pointer (a1 already advanced)
        addu.qb   t0, t0, s8
        addu      a3, a3, a2
        addu.qb   t4, t4, s8
        sw        t0, 0(a3)        // outptr[0/1/2/3]  D C B A
        bne       a0, t9, 0b
        sw        t4, 4(a3)        // outptr[4/5/6/7]  H G F E  (delay slot)
2:
RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8, a3
        j         ra
        nop
END(jsimd_idct_ifast_rows_dspr2)
  2246. /*****************************************************************************/
LEAF_DSPR2(jsimd_fdct_islow_dspr2)
/*
 * Accurate forward DCT on one 8x8 block of int16 samples, in place.
 *
 * a0 = data
 *
 * Pass 1 (label 1:) transforms rows using the four DSP accumulators:
 * dpa.w.ph performs a dual multiply-accumulate on packed halfword pairs,
 * so t0-t9 are preloaded with two 16-bit fixed-point coefficients per
 * register (upper|lower); negative coefficients appear as 16-bit two's
 * complements in the lui/ori immediates (see the per-line comments for
 * the signed values).  Pass 2 (label 2:) transforms columns with scalar
 * mult/madd/msub sequences; the coefficient registers are reloaded as
 * scalars first, and a1 is repurposed to hold constant c10 = 10704.
 *
 * NOTE: the instruction after every branch is its delay slot.
 */
SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8
        lui       t0, 6437         // t0 =  6437 |  2260   (packed coefficient pair)
        ori       t0, 2260
        lui       t1, 9633         // t1 =  9633 | 11363
        ori       t1, 11363
        lui       t2, 0xd39e       // t2 = -11362| -6436
        ori       t2, 0xe6dc
        lui       t3, 0xf72d       // t3 = -2259 |  9633
        ori       t3, 9633
        lui       t4, 2261         // t4 =  2261 |  9633
        ori       t4, 9633
        lui       t5, 0xd39e       // t5 = -11362|  6437
        ori       t5, 6437
        lui       t6, 9633         // t6 =  9633 |-11363
        ori       t6, 0xd39d
        lui       t7, 0xe6dc       // t7 = -6436 |  2260
        ori       t7, 2260
        lui       t8, 4433         // t8 =  4433 | 10703
        ori       t8, 10703
        lui       t9, 0xd630       // t9 = -10704|  4433
        ori       t9, 4433
        li        s8, 8            // s8 = row counter
        move      a1, a0
1:                                 // ======== pass 1: one row per iteration ========
        lw        s0, 0(a1)        // tmp0 = 1|0
        lw        s1, 4(a1)        // tmp1 = 3|2
        lw        s2, 8(a1)        // tmp2 = 5|4
        lw        s3, 12(a1)       // tmp3 = 7|6
        packrl.ph s1, s1, s1       // tmp1 = 2|3  (swap halves for the butterflies)
        packrl.ph s3, s3, s3       // tmp3 = 6|7
        subq.ph   s7, s1, s2       // tmp7 = 2-5|3-4 = t5|t4
        subq.ph   s5, s0, s3       // tmp5 = 1-6|0-7 = t6|t7
        mult      $0, $0           // ac0 = 0
        dpa.w.ph  $ac0, s7, t0     // ac0 += t5* 6437 + t4* 2260
        dpa.w.ph  $ac0, s5, t1     // ac0 += t6* 9633 + t7* 11363
        mult      $ac1, $0, $0     // ac1 = 0
        dpa.w.ph  $ac1, s7, t2     // ac1 += t5*-11362 + t4* -6436
        dpa.w.ph  $ac1, s5, t3     // ac1 += t6* -2259 + t7* 9633
        mult      $ac2, $0, $0     // ac2 = 0
        dpa.w.ph  $ac2, s7, t4     // ac2 += t5* 2261 + t4* 9633
        dpa.w.ph  $ac2, s5, t5     // ac2 += t6*-11362 + t7* 6437
        mult      $ac3, $0, $0     // ac3 = 0
        dpa.w.ph  $ac3, s7, t6     // ac3 += t5* 9633 + t4*-11363
        dpa.w.ph  $ac3, s5, t7     // ac3 += t6* -6436 + t7* 2260
        addq.ph   s6, s1, s2       // tmp6 = 2+5|3+4 = t2|t3
        addq.ph   s4, s0, s3       // tmp4 = 1+6|0+7 = t1|t0
        extr_r.w  s0, $ac0, 11     // tmp0 = (ac0 + 1024) >> 11
        extr_r.w  s1, $ac1, 11     // tmp1 = (ac1 + 1024) >> 11
        extr_r.w  s2, $ac2, 11     // tmp2 = (ac2 + 1024) >> 11
        extr_r.w  s3, $ac3, 11     // tmp3 = (ac3 + 1024) >> 11
        addq.ph   s5, s4, s6       // tmp5 = t1+t2|t0+t3 = t11|t10
        subq.ph   s7, s4, s6       // tmp7 = t1-t2|t0-t3 = t12|t13
        sh        s0, 2(a1)        // odd outputs: row[1], row[3], row[5], row[7]
        sh        s1, 6(a1)
        sh        s2, 10(a1)
        sh        s3, 14(a1)
        mult      $0, $0           // ac0 = 0
        dpa.w.ph  $ac0, s7, t8     // ac0 += t12* 4433 + t13* 10703
        mult      $ac1, $0, $0     // ac1 = 0
        dpa.w.ph  $ac1, s7, t9     // ac1 += t12*-10704 + t13* 4433
        sra       s4, s5, 16       // tmp4 = t11
        addiu     a1, a1, 16       // next row (stores below use negative offsets)
        addiu     s8, s8, -1
        extr_r.w  s0, $ac0, 11     // tmp0 = (ac0 + 1024) >> 11
        extr_r.w  s1, $ac1, 11     // tmp1 = (ac1 + 1024) >> 11
        addu      s2, s5, s4       // tmp2 = t10 + t11
        subu      s3, s5, s4       // tmp3 = t10 - t11
        sll       s2, s2, 2        // tmp2 = (t10 + t11) << 2
        sll       s3, s3, 2        // tmp3 = (t10 - t11) << 2
        sh        s2, -16(a1)      // row[0]
        sh        s3, -8(a1)       // row[4]
        sh        s0, -12(a1)      // row[2]
        bgtz      s8, 1b
        sh        s1, -4(a1)       // row[6]  (delay slot)
        // ---- reload coefficients as scalars for the column pass ----
        li        t0, 2260         // c0
        li        t1, 11363        // c1
        li        t2, 9633         // c2
        li        t3, 6436         // c3
        li        t4, 6437         // c4
        li        t5, 2261         // c5
        li        t6, 11362        // c6
        li        t7, 2259         // c7
        li        t8, 4433         // c8
        li        t9, 10703        // c9
        li        a1, 10704        // c10 (a1 is free after pass 1)
        li        s8, 8            // s8 = column counter
2:                                 // ======== pass 2: one column per iteration ========
        lh        a2, 0(a0)        // 0
        lh        a3, 16(a0)       // 8
        lh        v0, 32(a0)       // 16
        lh        v1, 48(a0)       // 24
        lh        s4, 64(a0)       // 32
        lh        s5, 80(a0)       // 40
        lh        s6, 96(a0)       // 48
        lh        s7, 112(a0)      // 56
        addu      s2, v0, s5       // tmp2 = 16 + 40
        subu      s5, v0, s5       // tmp5 = 16 - 40
        addu      s3, v1, s4       // tmp3 = 24 + 32
        subu      s4, v1, s4       // tmp4 = 24 - 32
        addu      s0, a2, s7       // tmp0 = 0 + 56
        subu      s7, a2, s7       // tmp7 = 0 - 56
        addu      s1, a3, s6       // tmp1 = 8 + 48
        subu      s6, a3, s6       // tmp6 = 8 - 48
        addu      a2, s0, s3       // tmp10 = tmp0 + tmp3
        subu      v1, s0, s3       // tmp13 = tmp0 - tmp3
        addu      a3, s1, s2       // tmp11 = tmp1 + tmp2
        subu      v0, s1, s2       // tmp12 = tmp1 - tmp2
        mult      s7, t1           // ac0 = tmp7 * c1
        madd      s4, t0           // ac0 += tmp4 * c0
        madd      s5, t4           // ac0 += tmp5 * c4
        madd      s6, t2           // ac0 += tmp6 * c2
        mult      $ac1, s7, t2     // ac1 = tmp7 * c2
        msub      $ac1, s4, t3     // ac1 -= tmp4 * c3
        msub      $ac1, s5, t6     // ac1 -= tmp5 * c6
        msub      $ac1, s6, t7     // ac1 -= tmp6 * c7
        mult      $ac2, s7, t4     // ac2 = tmp7 * c4
        madd      $ac2, s4, t2     // ac2 += tmp4 * c2
        madd      $ac2, s5, t5     // ac2 += tmp5 * c5
        msub      $ac2, s6, t6     // ac2 -= tmp6 * c6
        mult      $ac3, s7, t0     // ac3 = tmp7 * c0
        msub      $ac3, s4, t1     // ac3 -= tmp4 * c1
        madd      $ac3, s5, t2     // ac3 += tmp5 * c2
        msub      $ac3, s6, t3     // ac3 -= tmp6 * c3
        extr_r.w  s0, $ac0, 15     // tmp0 = (ac0 + 16384) >> 15
        extr_r.w  s1, $ac1, 15     // tmp1 = (ac1 + 16384) >> 15
        extr_r.w  s2, $ac2, 15     // tmp2 = (ac2 + 16384) >> 15
        extr_r.w  s3, $ac3, 15     // tmp3 = (ac3 + 16384) >> 15
        addiu     s8, s8, -1
        addu      s4, a2, a3       // tmp4 = tmp10 + tmp11
        subu      s5, a2, a3       // tmp5 = tmp10 - tmp11
        sh        s0, 16(a0)       // odd outputs: rows 1, 3, 5, 7 of this column
        sh        s1, 48(a0)
        sh        s2, 80(a0)
        sh        s3, 112(a0)
        mult      v0, t8           // ac0 = tmp12 * c8
        madd      v1, t9           // ac0 += tmp13 * c9
        mult      $ac1, v1, t8     // ac1 = tmp13 * c8
        msub      $ac1, v0, a1     // ac1 -= tmp12 * c10
        addiu     a0, a0, 2        // next column (stores below compensate with -2)
        extr_r.w  s6, $ac0, 15     // tmp6 = (ac0 + 16384) >> 15
        extr_r.w  s7, $ac1, 15     // tmp7 = (ac1 + 16384) >> 15
        shra_r.w  s4, s4, 2        // tmp4 = (tmp4 + 2) >> 2
        shra_r.w  s5, s5, 2        // tmp5 = (tmp5 + 2) >> 2
        sh        s4, -2(a0)       // row 0
        sh        s5, 62(a0)       // row 4
        sh        s6, 30(a0)       // row 2
        bgtz      s8, 2b
        sh        s7, 94(a0)       // row 6  (delay slot)
RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8
        jr        ra
        nop
END(jsimd_fdct_islow_dspr2)
  2403. /**************************************************************************/
LEAF_DSPR2(jsimd_fdct_ifast_dspr2)
/*
 * Fast forward DCT on one 8x8 block of int16 samples, in place.
 *
 * a0 = data
 *
 * Row pass (label 0:) uses packed halfword arithmetic and the DSP
 * accumulators (each FIX constant is replicated into both halves of a
 * register so dpa.w.ph applies it to a halfword pair); column pass
 * (label 1:) works one column at a time with a mix of packed and scalar
 * multiplies.  The 8-bit scale of the constants is removed with
 * extr.w ..., 8 / sra ..., 8.
 *
 * NOTE: the instruction after every branch is its delay slot.
 */
.set at
SAVE_REGS_ON_STACK 8, s0, s1
        li        a1, 0x014e014e   // FIX_1_306562965 (334 << 16)|(334 & 0xffff)
        li        a2, 0x008b008b   // FIX_0_541196100 (139 << 16)|(139 & 0xffff)
        li        a3, 0x00620062   // FIX_0_382683433 (98 << 16) |(98 & 0xffff)
        li        s1, 0x00b500b5   // FIX_0_707106781 (181 << 16)|(181 & 0xffff)
        move      v0, a0
        addiu     v1, v0, 128      // end address (8 rows x 16 bytes)
0:                                 // ======== row pass: one row per iteration ========
        lw        t0, 0(v0)        // tmp0 = 1|0
        lw        t1, 4(v0)        // tmp1 = 3|2
        lw        t2, 8(v0)        // tmp2 = 5|4
        lw        t3, 12(v0)       // tmp3 = 7|6
        packrl.ph t1, t1, t1       // tmp1 = 2|3  (swap halves for the butterflies)
        packrl.ph t3, t3, t3       // tmp3 = 6|7
        subq.ph   t7, t1, t2       // tmp7 = 2-5|3-4 = t5|t4
        subq.ph   t5, t0, t3       // tmp5 = 1-6|0-7 = t6|t7
        addq.ph   t6, t1, t2       // tmp6 = 2+5|3+4 = t2|t3
        addq.ph   t4, t0, t3       // tmp4 = 1+6|0+7 = t1|t0
        addq.ph   t8, t4, t6       // tmp5 = t1+t2|t0+t3 = t11|t10
        subq.ph   t9, t4, t6       // tmp7 = t1-t2|t0-t3 = t12|t13
        sra       t4, t8, 16       // tmp4 = t11
        mult      $0, $0           // ac0 = 0
        dpa.w.ph  $ac0, t9, s1     // ac0 += t12*181 + t13*181
        mult      $ac1, $0, $0     // ac1 = 0
        dpa.w.ph  $ac1, t7, a3     // ac1 += t4*98 + t5*98
        dpsx.w.ph $ac1, t5, a3     // ac1 += t6*98 + t7*98
        mult      $ac2, $0, $0     // ac2 = 0
        dpa.w.ph  $ac2, t7, a2     // ac2 += t4*139 + t5*139
        mult      $ac3, $0, $0     // ac3 = 0
        dpa.w.ph  $ac3, t5, a1     // ac3 += t6*334 + t7*334
        precrq.ph.w t0, t5, t7     // t0 = t5|t6
        addq.ph   t2, t8, t4       // tmp2 = t10 + t11
        subq.ph   t3, t8, t4       // tmp3 = t10 - t11
        extr.w    t4, $ac0, 8      // drop the 8-bit constant scale
        mult      $0, $0           // ac0 = 0
        dpa.w.ph  $ac0, t0, s1     // ac0 += t5*181 + t6*181
        extr.w    t0, $ac1, 8      // t0 = z5
        extr.w    t1, $ac2, 8      // t1 = MULTIPLY(tmp10, 139)
        extr.w    t7, $ac3, 8      // t2 = MULTIPLY(tmp12, 334)
        extr.w    t8, $ac0, 8      // t8 = z3 = MULTIPLY(tmp11, 181)
        add       t6, t1, t0       // t6 = z2
        add       t7, t7, t0       // t7 = z4
        subq.ph   t0, t5, t8       // t0 = z13 = tmp7 - z3
        addq.ph   t8, t5, t8       // t9 = z11 = tmp7 + z3
        addq.ph   t1, t0, t6       // t1 = z13 + z2
        subq.ph   t6, t0, t6       // t6 = z13 - z2
        addq.ph   t0, t8, t7       // t0 = z11 + z4
        subq.ph   t7, t8, t7       // t7 = z11 - z4
        addq.ph   t5, t4, t9
        subq.ph   t4, t9, t4
        sh        t2, 0(v0)        // row[0]
        sh        t5, 4(v0)        // row[2]
        sh        t3, 8(v0)        // row[4]
        sh        t4, 12(v0)       // row[6]
        sh        t1, 10(v0)       // row[5]
        sh        t6, 6(v0)        // row[3]
        sh        t0, 2(v0)        // row[1]
        sh        t7, 14(v0)       // row[7]
        addiu     v0, 16           // next row
        bne       v1, v0, 0b
        nop
        move      v0, a0
        addiu     v1, v0, 16       // end address (8 columns x 2 bytes)
1:                                 // ======== column pass: one column per iteration ========
        lh        t0, 0(v0)        // 0
        lh        t1, 16(v0)       // 8
        lh        t2, 32(v0)       // 16
        lh        t3, 48(v0)       // 24
        lh        t4, 64(v0)       // 32
        lh        t5, 80(v0)       // 40
        lh        t6, 96(v0)       // 48
        lh        t7, 112(v0)      // 56
        add       t8, t0, t7       // t8 = tmp0
        sub       t7, t0, t7       // t7 = tmp7
        add       t0, t1, t6       // t0 = tmp1
        sub       t1, t1, t6       // t1 = tmp6
        add       t6, t2, t5       // t6 = tmp2
        sub       t5, t2, t5       // t5 = tmp5
        add       t2, t3, t4       // t2 = tmp3
        sub       t3, t3, t4       // t3 = tmp4
        add       t4, t8, t2       // t4 = tmp10 = tmp0 + tmp3
        sub       t8, t8, t2       // t8 = tmp13 = tmp0 - tmp3
        sub       s0, t0, t6       // s0 = tmp12 = tmp1 - tmp2
        ins       t8, s0, 16, 16   // t8 = tmp12|tmp13 (packed for the dual MAC)
        add       t2, t0, t6       // t2 = tmp11 = tmp1 + tmp2
        mult      $0, $0           // ac0 = 0
        dpa.w.ph  $ac0, t8, s1     // ac0 += t12*181 + t13*181
        add       s0, t4, t2       // t8 = tmp10+tmp11
        sub       t4, t4, t2       // t4 = tmp10-tmp11
        sh        s0, 0(v0)        // column row 0
        sh        t4, 64(v0)       // column row 4
        extr.w    t2, $ac0, 8      // z1 = MULTIPLY(tmp12+tmp13, FIX_0_707106781)
        addq.ph   t4, t8, t2       // t9 = tmp13 + z1
        subq.ph   t8, t8, t2       // t2 = tmp13 - z1
        sh        t4, 32(v0)       // column row 2
        sh        t8, 96(v0)       // column row 6
        add       t3, t3, t5       // t3 = tmp10 = tmp4 + tmp5
        add       t0, t5, t1       // t0 = tmp11 = tmp5 + tmp6
        add       t1, t1, t7       // t1 = tmp12 = tmp6 + tmp7
        andi      t4, a1, 0xffff   // extract the scalar 334 from the packed constant
        mul       s0, t1, t4
        sra       s0, s0, 8        // s0 = z4 = MULTIPLY(tmp12, FIX_1_306562965)
        ins       t1, t3, 16, 16   // t1 = tmp10|tmp12
        mult      $0, $0           // ac0 = 0
        mulsa.w.ph $ac0, t1, a3    // ac0 += t10*98 - t12*98
        extr.w    t8, $ac0, 8      // z5 = MULTIPLY(tmp10-tmp12, FIX_0_382683433)
        add       t2, t7, t8       // t2 = tmp7 + z5
        sub       t7, t7, t8       // t7 = tmp7 - z5
        andi      t4, a2, 0xffff   // scalar 139
        mul       t8, t3, t4
        sra       t8, t8, 8        // t8 = z2 = MULTIPLY(tmp10, FIX_0_541196100)
        andi      t4, s1, 0xffff   // scalar 181
        mul       t6, t0, t4
        sra       t6, t6, 8        // t6 = z3 = MULTIPLY(tmp11, FIX_0_707106781)
        add       t0, t6, t8       // t0 = z3 + z2
        sub       t1, t6, t8       // t1 = z3 - z2
        add       t3, t6, s0       // t3 = z3 + z4
        sub       t4, t6, s0       // t4 = z3 - z4
        sub       t5, t2, t1       // t5 = dataptr[5]
        sub       t6, t7, t0       // t6 = dataptr[3]
        add       t3, t2, t3       // t3 = dataptr[1]
        add       t4, t7, t4       // t4 = dataptr[7]
        sh        t5, 80(v0)
        sh        t6, 48(v0)
        sh        t3, 16(v0)
        sh        t4, 112(v0)
        addiu     v0, 2            // next column
        bne       v0, v1, 1b
        nop
RESTORE_REGS_FROM_STACK 8, s0, s1
        j         ra
        nop
END(jsimd_fdct_ifast_dspr2)
  2542. /*****************************************************************************/
2543. LEAF_DSPR2(jsimd_quantize_dspr2)
2544. /*
2545. * a0 = coef_block
2546. * a1 = divisors
2547. * a2 = workspace
2548. */
/*
 * Integer quantization of 64 DCT coefficients:
 *   out[i] = sign(ws[i]) * (((|ws[i]| + corr[i]) * recip[i]) >> (shift[i] + 16))
 * Two coefficients are produced per loop iteration, and the loop body is
 * software-pipelined: the loads feeding the NEXT iteration are issued
 * before the stores of the current one, and the `bne` delay slot advances
 * the output pointer.
 *
 * divisors table layout (halfword offsets from a1):
 *   0(a1)/2(a1)     = reciprocal values
 *   128(a1)/130(a1) = correction values
 *   384(a1)/386(a1) = shift counts
 * NOTE(review): layout inferred from the offsets used below -- confirm
 * against the C code that builds the divisors table.
 */
2549. .set at
2550. SAVE_REGS_ON_STACK 16, s0, s1, s2
2551. addiu v0, a2, 124 // v0 = workspace_end
2552. lh t0, 0(a2)
2553. lh t1, 0(a1)
2554. lh t2, 128(a1)
// t3 = (t0 >> 15)*2 + 1 = -1 if coefficient negative, +1 otherwise;
// multiplying by t3 gives |t0| now and restores the sign later.
2555. sra t3, t0, 15
2556. sll t3, t3, 1
2557. addiu t3, t3, 1
2558. mul t0, t0, t3 // t0 = |coef|
2559. lh t4, 384(a1)
2560. lh t5, 130(a1)
2561. lh t6, 2(a2)
2562. lh t7, 2(a1)
2563. lh t8, 386(a1)
2564. 1:
2565. andi t1, 0xffff // treat reciprocal as unsigned 16-bit
2566. add t9, t0, t2 // t9 = |coef| + correction
2567. andi t9, 0xffff
2568. mul v1, t9, t1
// sign helper for the second coefficient of this pair (t6)
2569. sra s0, t6, 15
2570. sll s0, s0, 1
2571. addiu s0, s0, 1
2572. addiu t9, t4, 16 // total shift = shift count + 16
2573. srav v1, v1, t9
2574. mul v1, v1, t3 // restore sign of first result
2575. mul t6, t6, s0 // t6 = |coef2|
2576. andi t7, 0xffff
2577. addiu a2, a2, 4
2578. addiu a1, a1, 4
2579. add s1, t6, t5 // s1 = |coef2| + correction2
2580. andi s1, 0xffff
2581. sh v1, 0(a0)
2582. mul s2, s1, t7
2583. addiu s1, t8, 16
2584. srav s2, s2, s1
2585. mul s2, s2, s0 // restore sign of second result
// preload operands for the next iteration (a1/a2 already advanced)
2586. lh t0, 0(a2)
2587. lh t1, 0(a1)
2588. sra t3, t0, 15
2589. sll t3, t3, 1
2590. addiu t3, t3, 1
2591. mul t0, t0, t3
2592. lh t2, 128(a1)
2593. lh t4, 384(a1)
2594. lh t5, 130(a1)
2595. lh t8, 386(a1)
2596. lh t6, 2(a2)
2597. lh t7, 2(a1)
2598. sh s2, 2(a0)
2599. lh t0, 0(a2)
2600. sra t3, t0, 15
2601. sll t3, t3, 1
2602. addiu t3, t3, 1
2603. mul t0, t0, t3
2604. bne a2, v0, 1b
2605. addiu a0, a0, 4 // delay slot: advance output pointer
// loop epilogue: quantize the final pair of coefficients
2606. andi t1, 0xffff
2607. add t9, t0, t2
2608. andi t9, 0xffff
2609. mul v1, t9, t1
2610. sra s0, t6, 15
2611. sll s0, s0, 1
2612. addiu s0, s0, 1
2613. addiu t9, t4, 16
2614. srav v1, v1, t9
2615. mul v1, v1, t3
2616. mul t6, t6, s0
2617. andi t7, 0xffff
2618. sh v1, 0(a0)
2619. add s1, t6, t5
2620. andi s1, 0xffff
2621. mul s2, s1, t7
2622. addiu s1, t8, 16
2623. addiu a2, a2, 4
2624. addiu a1, a1, 4
2625. srav s2, s2, s1
2626. mul s2, s2, s0
2627. sh s2, 2(a0)
2628. RESTORE_REGS_FROM_STACK 16, s0, s1, s2
2629. j ra
2630. nop
2631. END(jsimd_quantize_dspr2)
2632. #ifndef __mips_soft_float
2633. /*****************************************************************************/
2634. LEAF_DSPR2(jsimd_quantize_float_dspr2)
2635. /*
2636. * a0 = coef_block
2637. * a1 = divisors
2638. * a2 = workspace
2639. */
/*
 * Float quantization: for each of the 64 coefficients,
 *   out[i] = (int)(workspace[i] * divisors[i] + 16384.5) - 16384
 * The +16384.5 bias followed by trunc.w.s implements round-to-nearest
 * without needing a rounding-mode change; the -16384 is removed again
 * with integer addiu.  Eight coefficients are processed per iteration,
 * interleaving FPU work with integer fix-up to hide latency.
 */
2640. .set at
2641. li t1, 0x46800100 // integer representation 16384.5
2642. mtc1 t1, f0
2643. li t0, 63 // 64 coefficients, 8 per iteration
2644. 0:
2645. lwc1 f2, 0(a2)
2646. lwc1 f10, 0(a1)
2647. lwc1 f4, 4(a2)
2648. lwc1 f12, 4(a1)
2649. lwc1 f6, 8(a2)
2650. lwc1 f14, 8(a1)
2651. lwc1 f8, 12(a2)
2652. lwc1 f16, 12(a1)
// fN = 16384.5 + ws[i] * div[i]  (fused multiply-add with the bias)
2653. madd.s f2, f0, f2, f10
2654. madd.s f4, f0, f4, f12
2655. madd.s f6, f0, f6, f14
2656. madd.s f8, f0, f8, f16
2657. lwc1 f10, 16(a1)
2658. lwc1 f12, 20(a1)
2659. trunc.w.s f2, f2
2660. trunc.w.s f4, f4
2661. trunc.w.s f6, f6
2662. trunc.w.s f8, f8
2663. lwc1 f14, 24(a1)
2664. lwc1 f16, 28(a1)
2665. mfc1 t1, f2
2666. mfc1 t2, f4
2667. mfc1 t3, f6
2668. mfc1 t4, f8
2669. lwc1 f2, 16(a2)
2670. lwc1 f4, 20(a2)
2671. lwc1 f6, 24(a2)
2672. lwc1 f8, 28(a2)
2673. madd.s f2, f0, f2, f10
2674. madd.s f4, f0, f4, f12
2675. madd.s f6, f0, f6, f14
2676. madd.s f8, f0, f8, f16
// remove the 16384 bias from the first group of four results
2677. addiu t1, t1, -16384
2678. addiu t2, t2, -16384
2679. addiu t3, t3, -16384
2680. addiu t4, t4, -16384
2681. trunc.w.s f2, f2
2682. trunc.w.s f4, f4
2683. trunc.w.s f6, f6
2684. trunc.w.s f8, f8
2685. sh t1, 0(a0)
2686. sh t2, 2(a0)
2687. sh t3, 4(a0)
2688. sh t4, 6(a0)
2689. mfc1 t1, f2
2690. mfc1 t2, f4
2691. mfc1 t3, f6
2692. mfc1 t4, f8
2693. addiu t0, t0, -8
2694. addiu a2, a2, 32
2695. addiu a1, a1, 32
2696. addiu t1, t1, -16384
2697. addiu t2, t2, -16384
2698. addiu t3, t3, -16384
2699. addiu t4, t4, -16384
2700. sh t1, 8(a0)
2701. sh t2, 10(a0)
2702. sh t3, 12(a0)
2703. sh t4, 14(a0)
2704. bgez t0, 0b
2705. addiu a0, a0, 16 // delay slot: advance output pointer
2706. j ra
2707. nop
2708. END(jsimd_quantize_float_dspr2)
2709. #endif
  2710. /*****************************************************************************/
2711. LEAF_DSPR2(jsimd_idct_2x2_dspr2)
2712. /*
2713. * a0 = compptr->dct_table
2714. * a1 = coef_block
2715. * a2 = output_buf
2716. * a3 = output_col
2717. */
/*
 * Reduced-size 2x2 inverse DCT.  Pass 1 processes the 5 coefficient
 * columns that contribute (rows 0,1,3,5,7 of each column pair), writing
 * intermediate values to a 40-byte scratch area on the stack (v0 = sp).
 * Pass 2 combines the scratch values into the 2x2 output pixels, with
 * descale, saturation (shll_s.w/sra by 24) and +128 level shift.
 * s2..s5 hold the odd-part multipliers; s0/s1 pack the same constants
 * as halfword pairs for the dpa.w.ph dual-MAC.
 * NOTE(review): constants correspond to the libjpeg 2x2 scaled IDCT
 * (jidctred.c) -- confirm exact FIX() identities against that source.
 */
2718. .set at
2719. SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4, s5
2720. addiu sp, sp, -40
2721. move v0, sp // v0 = scratch workspace on stack
2722. addiu s2, zero, 29692
2723. addiu s3, zero, -10426
2724. addiu s4, zero, 6967
2725. addiu s5, zero, -5906
// ---- pass 1, column group 0 ----
2726. lh t0, 0(a1) // t0 = inptr[DCTSIZE*0]
2727. lh t5, 0(a0) // t5 = quantptr[DCTSIZE*0]
2728. lh t1, 48(a1) // t1 = inptr[DCTSIZE*3]
2729. lh t6, 48(a0) // t6 = quantptr[DCTSIZE*3]
2730. mul t4, t5, t0
2731. lh t0, 16(a1) // t0 = inptr[DCTSIZE*1]
2732. lh t5, 16(a0) // t5 = quantptr[DCTSIZE*1]
2733. mul t6, t6, t1
2734. mul t5, t5, t0
2735. lh t2, 80(a1) // t2 = inptr[DCTSIZE*5]
2736. lh t7, 80(a0) // t7 = quantptr[DCTSIZE*5]
2737. lh t3, 112(a1) // t3 = inptr[DCTSIZE*7]
2738. lh t8, 112(a0) // t8 = quantptr[DCTSIZE*7]
2739. mul t7, t7, t2
2740. mult zero, zero // clear ac0 accumulator
2741. mul t8, t8, t3
2742. li s0, 0x73FCD746 // s0 = (29692 << 16) | (-10426 & 0xffff)
2743. li s1, 0x1B37E8EE // s1 = (6967 << 16) | (-5906 & 0xffff)
2744. ins t6, t5, 16, 16 // t6 = t5|t6
2745. sll t4, t4, 15 // even part: DC term scaled up
2746. dpa.w.ph $ac0, t6, s0
2747. lh t1, 2(a1)
2748. lh t6, 2(a0)
2749. ins t8, t7, 16, 16 // t8 = t7|t8
2750. dpa.w.ph $ac0, t8, s1
2751. mflo t0, $ac0 // t0 = odd-part sum (tmp10)
2752. mul t5, t6, t1
2753. lh t1, 18(a1)
2754. lh t6, 18(a0)
2755. lh t2, 50(a1)
2756. lh t7, 50(a0)
2757. mul t6, t6, t1
2758. subu t8, t4, t0 // even - odd
2759. mul t7, t7, t2
2760. addu t0, t4, t0 // even + odd
2761. shra_r.w t0, t0, 13 // descale with rounding
2762. lh t1, 82(a1)
2763. lh t2, 82(a0)
2764. lh t3, 114(a1)
2765. lh t4, 114(a0)
2766. shra_r.w t8, t8, 13
2767. mul t1, t1, t2
2768. mul t3, t3, t4
2769. sw t0, 0(v0)
2770. sw t8, 20(v0)
// ---- pass 1, column group 1 ----
2771. sll t4, t5, 15
2772. ins t7, t6, 16, 16
2773. mult zero, zero
2774. dpa.w.ph $ac0, t7, s0
2775. ins t3, t1, 16, 16
2776. lh t1, 6(a1)
2777. lh t6, 6(a0)
2778. dpa.w.ph $ac0, t3, s1
2779. mflo t0, $ac0
2780. mul t5, t6, t1
2781. lh t1, 22(a1)
2782. lh t6, 22(a0)
2783. lh t2, 54(a1)
2784. lh t7, 54(a0)
2785. mul t6, t6, t1
2786. subu t8, t4, t0
2787. mul t7, t7, t2
2788. addu t0, t4, t0
2789. shra_r.w t0, t0, 13
2790. lh t1, 86(a1)
2791. lh t2, 86(a0)
2792. lh t3, 118(a1)
2793. lh t4, 118(a0)
2794. shra_r.w t8, t8, 13
2795. mul t1, t1, t2
2796. mul t3, t3, t4
2797. sw t0, 4(v0)
2798. sw t8, 24(v0)
// ---- pass 1, column group 2 ----
2799. sll t4, t5, 15
2800. ins t7, t6, 16, 16
2801. mult zero, zero
2802. dpa.w.ph $ac0, t7, s0
2803. ins t3, t1, 16, 16
2804. lh t1, 10(a1)
2805. lh t6, 10(a0)
2806. dpa.w.ph $ac0, t3, s1
2807. mflo t0, $ac0
2808. mul t5, t6, t1
2809. lh t1, 26(a1)
2810. lh t6, 26(a0)
2811. lh t2, 58(a1)
2812. lh t7, 58(a0)
2813. mul t6, t6, t1
2814. subu t8, t4, t0
2815. mul t7, t7, t2
2816. addu t0, t4, t0
2817. shra_r.w t0, t0, 13
2818. lh t1, 90(a1)
2819. lh t2, 90(a0)
2820. lh t3, 122(a1)
2821. lh t4, 122(a0)
2822. shra_r.w t8, t8, 13
2823. mul t1, t1, t2
2824. mul t3, t3, t4
2825. sw t0, 8(v0)
2826. sw t8, 28(v0)
// ---- pass 1, column group 3 ----
2827. sll t4, t5, 15
2828. ins t7, t6, 16, 16
2829. mult zero, zero
2830. dpa.w.ph $ac0, t7, s0
2831. ins t3, t1, 16, 16
2832. lh t1, 14(a1)
2833. lh t6, 14(a0)
2834. dpa.w.ph $ac0, t3, s1
2835. mflo t0, $ac0
2836. mul t5, t6, t1
2837. lh t1, 30(a1)
2838. lh t6, 30(a0)
2839. lh t2, 62(a1)
2840. lh t7, 62(a0)
2841. mul t6, t6, t1
2842. subu t8, t4, t0
2843. mul t7, t7, t2
2844. addu t0, t4, t0
2845. shra_r.w t0, t0, 13
2846. lh t1, 94(a1)
2847. lh t2, 94(a0)
2848. lh t3, 126(a1)
2849. lh t4, 126(a0)
2850. shra_r.w t8, t8, 13
2851. mul t1, t1, t2
2852. mul t3, t3, t4
2853. sw t0, 12(v0)
2854. sw t8, 32(v0)
// ---- pass 1, column group 4 ----
2855. sll t4, t5, 15
2856. ins t7, t6, 16, 16
2857. mult zero, zero
2858. dpa.w.ph $ac0, t7, s0
2859. ins t3, t1, 16, 16
2860. dpa.w.ph $ac0, t3, s1
2861. mflo t0, $ac0
// ---- pass 2: rows -> pixels ----
2862. lw t9, 0(a2) // output_buf[0]
2863. lw t3, 0(v0)
2864. lw t7, 4(v0)
2865. lw t1, 8(v0)
2866. addu t9, t9, a3 // outptr = output_buf[0] + output_col
2867. sll t3, t3, 15
2868. subu t8, t4, t0
2869. addu t0, t4, t0
2870. shra_r.w t0, t0, 13
2871. shra_r.w t8, t8, 13
2872. sw t0, 16(v0)
2873. sw t8, 36(v0)
2874. lw t5, 12(v0)
2875. lw t6, 16(v0)
// row 0 odd part accumulates in ac0
2876. mult t7, s2
2877. madd t1, s3
2878. madd t5, s4
2879. madd t6, s5
2880. lw t5, 24(v0)
2881. lw t7, 28(v0)
2882. mflo t0, $ac0
2883. lw t8, 32(v0)
2884. lw t2, 36(v0)
// row 1 odd part accumulates in ac1 (overlapped with row 0 output)
2885. mult $ac1, t5, s2
2886. madd $ac1, t7, s3
2887. madd $ac1, t8, s4
2888. madd $ac1, t2, s5
2889. addu t1, t3, t0
2890. subu t6, t3, t0
2891. shra_r.w t1, t1, 20
2892. shra_r.w t6, t6, 20
2893. mflo t4, $ac1
// saturate to signed 8-bit range, then +128 level shift
2894. shll_s.w t1, t1, 24
2895. shll_s.w t6, t6, 24
2896. sra t1, t1, 24
2897. sra t6, t6, 24
2898. addiu t1, t1, 128
2899. addiu t6, t6, 128
2900. lw t0, 20(v0)
2901. sb t1, 0(t9)
2902. sb t6, 1(t9)
2903. sll t0, t0, 15
2904. lw t9, 4(a2) // output_buf[1]
2905. addu t1, t0, t4
2906. subu t6, t0, t4
2907. addu t9, t9, a3
2908. shra_r.w t1, t1, 20
2909. shra_r.w t6, t6, 20
2910. shll_s.w t1, t1, 24
2911. shll_s.w t6, t6, 24
2912. sra t1, t1, 24
2913. sra t6, t6, 24
2914. addiu t1, t1, 128
2915. addiu t6, t6, 128
2916. sb t1, 0(t9)
2917. sb t6, 1(t9)
2918. addiu sp, sp, 40 // release scratch workspace
2919. RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4, s5
2920. j ra
2921. nop
2922. END(jsimd_idct_2x2_dspr2)
  2923. /*****************************************************************************/
2924. LEAF_DSPR2(jsimd_idct_4x4_dspr2)
2925. /*
2926. * a0 = compptr->dct_table
2927. * a1 = coef_block
2928. * a2 = output_buf
2929. * a3 = output_col
2930. * 16(sp) = workspace[DCTSIZE*4]; // buffers data between passes
2931. */
/*
 * Reduced-size 4x4 inverse DCT (libjpeg jidctred.c algorithm).
 * Pass 1 (two loops: 4 columns, then 3 columns) dequantizes and
 * transforms columns into the caller-provided workspace.  Pass 2 is
 * fully unrolled over the 4 output rows: each row is transformed,
 * descaled by 19 with rounding, saturated to 8 bits and level-shifted
 * by +128 before being stored to output_buf[ctr] + output_col.
 * s0..s3 hold packed halfword constant pairs for the dpa.w.ph
 * dual-MAC odd-part evaluation; 15137/6270 are FIX_1_847759065 and
 * FIX_0_765366865 scaled constants.
 */
2932. .set at
2933. SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
2934. lw v1, 48(sp) // v1 = workspace (stack arg, after reg save)
2935. move t0, a1
2936. move t1, v1
2937. li t9, 4 // first-loop column counter
2938. li s0, 0x2e75f93e
2939. li s1, 0x21f9ba79
2940. li s2, 0xecc2efb0
2941. li s3, 0x52031ccd
2942. 0:
2943. lh s6, 32(t0) // inptr[DCTSIZE*2]
2944. lh t6, 32(a0) // quantptr[DCTSIZE*2]
2945. lh s7, 96(t0) // inptr[DCTSIZE*6]
2946. lh t7, 96(a0) // quantptr[DCTSIZE*6]
2947. mul t6, s6, t6 // z2 = (inptr[DCTSIZE*2] * quantptr[DCTSIZE*2])
2948. lh s4, 0(t0) // inptr[DCTSIZE*0]
2949. mul t7, s7, t7 // z3 = (inptr[DCTSIZE*6] * quantptr[DCTSIZE*6])
2950. lh s5, 0(a0) // quantptr[0]
2951. li s6, 15137 // FIX_1_847759065
2952. li s7, 6270 // FIX_0_765366865
2953. mul t2, s4, s5 // tmp0 = (inptr[0] * quantptr[0])
2954. mul t6, s6, t6 // z2 = MULTIPLY(z2, FIX_1_847759065)
2955. lh t5, 112(t0) // inptr[DCTSIZE*7]
2956. mul t7, s7, t7 // z3 = MULTIPLY(z3, FIX_0_765366865)
2957. lh s4, 112(a0) // quantptr[DCTSIZE*7]
2958. lh v0, 80(t0) // inptr[DCTSIZE*5]
2959. lh s5, 80(a0) // quantptr[DCTSIZE*5]
2960. lh s6, 48(a0) // quantptr[DCTSIZE*3]
2961. sll t2, t2, 14 // tmp0 <<= (CONST_BITS+1)
2962. lh s7, 16(a0) // quantptr[DCTSIZE*1]
2963. lh t8, 16(t0) // inptr[DCTSIZE*1]
2964. subu t6, t6, t7 // tmp2 = MULTIPLY(z2, t5) - MULTIPLY(z3, t6)
2965. lh t7, 48(t0) // inptr[DCTSIZE*3]
2966. mul t5, s4, t5 // z1 = (inptr[DCTSIZE*7] * quantptr[DCTSIZE*7])
2967. mul v0, s5, v0 // z2 = (inptr[DCTSIZE*5] * quantptr[DCTSIZE*5])
2968. mul t7, s6, t7 // z3 = (inptr[DCTSIZE*3] * quantptr[DCTSIZE*3])
2969. mul t8, s7, t8 // z4 = (inptr[DCTSIZE*1] * quantptr[DCTSIZE*1])
2970. addu t3, t2, t6 // tmp10 = tmp0 + z2
2971. subu t4, t2, t6 // tmp12 = tmp0 - z2
2972. mult $ac0, zero, zero
2973. mult $ac1, zero, zero
2974. ins t5, v0, 16, 16
2975. ins t7, t8, 16, 16
2976. addiu t9, t9, -1
2977. dpa.w.ph $ac0, t5, s0
2978. dpa.w.ph $ac0, t7, s1
2979. dpa.w.ph $ac1, t5, s2
2980. dpa.w.ph $ac1, t7, s3
2981. mflo s4, $ac0 // temp1
2982. mflo s5, $ac1 // temp2
2983. addiu a0, a0, 2
2984. addiu t1, t1, 4
2985. addiu t0, t0, 2
2986. addu t6, t4, s4
2987. subu t5, t4, s4
2988. addu s6, t3, s5
2989. subu s7, t3, s5
2990. shra_r.w t6, t6, 12 // DESCALE(tmp12 + temp1, 12)
2991. shra_r.w t5, t5, 12 // DESCALE(tmp12 - temp1, 12)
2992. shra_r.w s6, s6, 12 // DESCALE(tmp10 + temp2, 12)
2993. shra_r.w s7, s7, 12 // DESCALE(tmp10 - temp2, 12)
2994. sw t6, 28(t1)
2995. sw t5, 60(t1)
2996. sw s6, -4(t1)
2997. bgtz t9, 0b
2998. sw s7, 92(t1) // delay slot: last store of the iteration
2999. // second loop three pass
3000. li t9, 3
3001. 1:
3002. lh s6, 34(t0) // inptr[DCTSIZE*2]
3003. lh t6, 34(a0) // quantptr[DCTSIZE*2]
3004. lh s7, 98(t0) // inptr[DCTSIZE*6]
3005. lh t7, 98(a0) // quantptr[DCTSIZE*6]
3006. mul t6, s6, t6 // z2 = (inptr[DCTSIZE*2] * quantptr[DCTSIZE*2])
3007. lh s4, 2(t0) // inptr[DCTSIZE*0]
3008. mul t7, s7, t7 // z3 = (inptr[DCTSIZE*6] * quantptr[DCTSIZE*6])
3009. lh s5, 2(a0) // quantptr[DCTSIZE*0]
3010. li s6, 15137 // FIX_1_847759065
3011. li s7, 6270 // FIX_0_765366865
3012. mul t2, s4, s5 // tmp0 = (inptr[0] * quantptr[0])
3013. mul v0, s6, t6 // z2 = MULTIPLY(z2, FIX_1_847759065)
3014. lh t5, 114(t0) // inptr[DCTSIZE*7]
3015. mul t7, s7, t7 // z3 = MULTIPLY(z3, FIX_0_765366865)
3016. lh s4, 114(a0) // quantptr[DCTSIZE*7]
3017. lh s5, 82(a0) // quantptr[DCTSIZE*5]
3018. lh t6, 82(t0) // inptr[DCTSIZE*5]
3019. sll t2, t2, 14 // tmp0 <<= (CONST_BITS+1)
3020. lh s6, 50(a0) // quantptr[DCTSIZE*3]
3021. lh t8, 18(t0) // inptr[DCTSIZE*1]
3022. subu v0, v0, t7 // tmp2 = MULTIPLY(z2, t5) - MULTIPLY(z3, t6)
3023. lh t7, 50(t0) // inptr[DCTSIZE*3]
3024. lh s7, 18(a0) // quantptr[DCTSIZE*1]
3025. mul t5, s4, t5 // z1 = (inptr[DCTSIZE*7] * quantptr[DCTSIZE*7])
3026. mul t6, s5, t6 // z2 = (inptr[DCTSIZE*5] * quantptr[DCTSIZE*5])
3027. mul t7, s6, t7 // z3 = (inptr[DCTSIZE*3] * quantptr[DCTSIZE*3])
3028. mul t8, s7, t8 // z4 = (inptr[DCTSIZE*1] * quantptr[DCTSIZE*1])
3029. addu t3, t2, v0 // tmp10 = tmp0 + z2
3030. subu t4, t2, v0 // tmp12 = tmp0 - z2
3031. mult $ac0, zero, zero
3032. mult $ac1, zero, zero
3033. ins t5, t6, 16, 16
3034. ins t7, t8, 16, 16
3035. dpa.w.ph $ac0, t5, s0
3036. dpa.w.ph $ac0, t7, s1
3037. dpa.w.ph $ac1, t5, s2
3038. dpa.w.ph $ac1, t7, s3
3039. mflo t5, $ac0 // temp1
3040. mflo t6, $ac1 // temp2
3041. addiu t9, t9, -1
3042. addiu t0, t0, 2
3043. addiu a0, a0, 2
3044. addiu t1, t1, 4
3045. addu s5, t4, t5
3046. subu s4, t4, t5
3047. addu s6, t3, t6
3048. subu s7, t3, t6
3049. shra_r.w s5, s5, 12 // DESCALE(tmp12 + temp1, 12)
3050. shra_r.w s4, s4, 12 // DESCALE(tmp12 - temp1, 12)
3051. shra_r.w s6, s6, 12 // DESCALE(tmp10 + temp2, 12)
3052. shra_r.w s7, s7, 12 // DESCALE(tmp10 - temp2, 12)
3053. sw s5, 32(t1)
3054. sw s4, 64(t1)
3055. sw s6, 0(t1)
3056. bgtz t9, 1b
3057. sw s7, 96(t1) // delay slot: last store of the iteration
// ---- pass 2, row 0 ----
3058. move t1, v1
3059. li s4, 15137
3060. lw s6, 8(t1) // wsptr[2]
3061. li s5, 6270
3062. lw s7, 24(t1) // wsptr[6]
3063. mul s4, s4, s6 // MULTIPLY((JLONG)wsptr[2], FIX_1_847759065)
3064. lw t2, 0(t1) // wsptr[0]
3065. mul s5, s5, s7 // MULTIPLY((JLONG)wsptr[6], -FIX_0_765366865)
3066. lh t5, 28(t1) // wsptr[7]
3067. lh t6, 20(t1) // wsptr[5]
3068. lh t7, 12(t1) // wsptr[3]
3069. lh t8, 4(t1) // wsptr[1]
3070. ins t5, t6, 16, 16
3071. ins t7, t8, 16, 16
3072. mult $ac0, zero, zero
3073. dpa.w.ph $ac0, t5, s0
3074. dpa.w.ph $ac0, t7, s1
3075. mult $ac1, zero, zero
3076. dpa.w.ph $ac1, t5, s2
3077. dpa.w.ph $ac1, t7, s3
3078. sll t2, t2, 14 // tmp0 = ((JLONG)wsptr[0]) << (CONST_BITS+1)
3079. mflo s6, $ac0
3080. // MULTIPLY(wsptr[2], FIX_1_847759065 + MULTIPLY(wsptr[6], -FIX_0_765366865)
3081. subu s4, s4, s5
3082. addu t3, t2, s4 // tmp10 = tmp0 + z2
3083. mflo s7, $ac1
3084. subu t4, t2, s4 // tmp12 = tmp0 - z2
3085. addu t7, t4, s6
3086. subu t8, t4, s6
3087. addu t5, t3, s7
3088. subu t6, t3, s7
3089. shra_r.w t5, t5, 19 // DESCALE(tmp10 + temp2, 19)
3090. shra_r.w t6, t6, 19 // DESCALE(tmp10 - temp2, 19)
3091. shra_r.w t7, t7, 19 // DESCALE(tmp12 + temp1, 19)
3092. shra_r.w t8, t8, 19 // DESCALE(tmp12 - temp1, 19)
3093. sll s4, t9, 2
3094. lw v0, 0(a2) // output_buf[ctr]
3095. shll_s.w t5, t5, 24 // saturate to signed 8-bit
3096. shll_s.w t6, t6, 24
3097. shll_s.w t7, t7, 24
3098. shll_s.w t8, t8, 24
3099. sra t5, t5, 24
3100. sra t6, t6, 24
3101. sra t7, t7, 24
3102. sra t8, t8, 24
3103. addu v0, v0, a3 // outptr = output_buf[ctr] + output_col
3104. addiu t5, t5, 128 // +128 level shift
3105. addiu t6, t6, 128
3106. addiu t7, t7, 128
3107. addiu t8, t8, 128
3108. sb t5, 0(v0)
3109. sb t7, 1(v0)
3110. sb t8, 2(v0)
3111. sb t6, 3(v0)
3112. // 2
3113. li s4, 15137
3114. lw s6, 40(t1) // wsptr[2]
3115. li s5, 6270
3116. lw s7, 56(t1) // wsptr[6]
3117. mul s4, s4, s6 // MULTIPLY((JLONG)wsptr[2], FIX_1_847759065)
3118. lw t2, 32(t1) // wsptr[0]
3119. mul s5, s5, s7 // MULTIPLY((JLONG)wsptr[6], -FIX_0_765366865)
3120. lh t5, 60(t1) // wsptr[7]
3121. lh t6, 52(t1) // wsptr[5]
3122. lh t7, 44(t1) // wsptr[3]
3123. lh t8, 36(t1) // wsptr[1]
3124. ins t5, t6, 16, 16
3125. ins t7, t8, 16, 16
3126. mult $ac0, zero, zero
3127. dpa.w.ph $ac0, t5, s0
3128. dpa.w.ph $ac0, t7, s1
3129. mult $ac1, zero, zero
3130. dpa.w.ph $ac1, t5, s2
3131. dpa.w.ph $ac1, t7, s3
3132. sll t2, t2, 14 // tmp0 = ((JLONG)wsptr[0]) << (CONST_BITS+1)
3133. mflo s6, $ac0
3134. // MULTIPLY(wsptr[2], FIX_1_847759065 + MULTIPLY(wsptr[6], -FIX_0_765366865)
3135. subu s4, s4, s5
3136. addu t3, t2, s4 // tmp10 = tmp0 + z2
3137. mflo s7, $ac1
3138. subu t4, t2, s4 // tmp12 = tmp0 - z2
3139. addu t7, t4, s6
3140. subu t8, t4, s6
3141. addu t5, t3, s7
3142. subu t6, t3, s7
3143. shra_r.w t5, t5, 19 // DESCALE(tmp10 + temp2, CONST_BITS-PASS1_BITS+1)
3144. shra_r.w t6, t6, 19 // DESCALE(tmp10 - temp2, CONST_BITS-PASS1_BITS+1)
3145. shra_r.w t7, t7, 19 // DESCALE(tmp12 + temp1, CONST_BITS-PASS1_BITS+1)
3146. shra_r.w t8, t8, 19 // DESCALE(tmp12 - temp1, CONST_BITS-PASS1_BITS+1)
3147. sll s4, t9, 2
3148. lw v0, 4(a2) // output_buf[ctr]
3149. shll_s.w t5, t5, 24
3150. shll_s.w t6, t6, 24
3151. shll_s.w t7, t7, 24
3152. shll_s.w t8, t8, 24
3153. sra t5, t5, 24
3154. sra t6, t6, 24
3155. sra t7, t7, 24
3156. sra t8, t8, 24
3157. addu v0, v0, a3 // outptr = output_buf[ctr] + output_col
3158. addiu t5, t5, 128
3159. addiu t6, t6, 128
3160. addiu t7, t7, 128
3161. addiu t8, t8, 128
3162. sb t5, 0(v0)
3163. sb t7, 1(v0)
3164. sb t8, 2(v0)
3165. sb t6, 3(v0)
3166. // 3
3167. li s4, 15137
3168. lw s6, 72(t1) // wsptr[2]
3169. li s5, 6270
3170. lw s7, 88(t1) // wsptr[6]
3171. mul s4, s4, s6 // MULTIPLY((JLONG)wsptr[2], FIX_1_847759065)
3172. lw t2, 64(t1) // wsptr[0]
3173. mul s5, s5, s7 // MULTIPLY((JLONG)wsptr[6], -FIX_0_765366865)
3174. lh t5, 92(t1) // wsptr[7]
3175. lh t6, 84(t1) // wsptr[5]
3176. lh t7, 76(t1) // wsptr[3]
3177. lh t8, 68(t1) // wsptr[1]
3178. ins t5, t6, 16, 16
3179. ins t7, t8, 16, 16
3180. mult $ac0, zero, zero
3181. dpa.w.ph $ac0, t5, s0
3182. dpa.w.ph $ac0, t7, s1
3183. mult $ac1, zero, zero
3184. dpa.w.ph $ac1, t5, s2
3185. dpa.w.ph $ac1, t7, s3
3186. sll t2, t2, 14 // tmp0 = ((JLONG)wsptr[0]) << (CONST_BITS+1)
3187. mflo s6, $ac0
3188. // MULTIPLY(wsptr[2], FIX_1_847759065 + MULTIPLY(wsptr[6], -FIX_0_765366865)
3189. subu s4, s4, s5
3190. addu t3, t2, s4 // tmp10 = tmp0 + z2
3191. mflo s7, $ac1
3192. subu t4, t2, s4 // tmp12 = tmp0 - z2
3193. addu t7, t4, s6
3194. subu t8, t4, s6
3195. addu t5, t3, s7
3196. subu t6, t3, s7
3197. shra_r.w t5, t5, 19 // DESCALE(tmp10 + temp2, 19)
3198. shra_r.w t6, t6, 19 // DESCALE(tmp10 - temp2, 19)
3199. shra_r.w t7, t7, 19 // DESCALE(tmp12 + temp1, 19)
3200. shra_r.w t8, t8, 19 // DESCALE(tmp12 - temp1, 19)
3201. sll s4, t9, 2
3202. lw v0, 8(a2) // output_buf[ctr]
3203. shll_s.w t5, t5, 24
3204. shll_s.w t6, t6, 24
3205. shll_s.w t7, t7, 24
3206. shll_s.w t8, t8, 24
3207. sra t5, t5, 24
3208. sra t6, t6, 24
3209. sra t7, t7, 24
3210. sra t8, t8, 24
3211. addu v0, v0, a3 // outptr = output_buf[ctr] + output_col
3212. addiu t5, t5, 128
3213. addiu t6, t6, 128
3214. addiu t7, t7, 128
3215. addiu t8, t8, 128
3216. sb t5, 0(v0)
3217. sb t7, 1(v0)
3218. sb t8, 2(v0)
3219. sb t6, 3(v0)
// ---- pass 2, row 3 ----
3220. li s4, 15137
3221. lw s6, 104(t1) // wsptr[2]
3222. li s5, 6270
3223. lw s7, 120(t1) // wsptr[6]
3224. mul s4, s4, s6 // MULTIPLY((JLONG)wsptr[2], FIX_1_847759065)
3225. lw t2, 96(t1) // wsptr[0]
3226. mul s5, s5, s7 // MULTIPLY((JLONG)wsptr[6], -FIX_0_765366865)
3227. lh t5, 124(t1) // wsptr[7]
3228. lh t6, 116(t1) // wsptr[5]
3229. lh t7, 108(t1) // wsptr[3]
3230. lh t8, 100(t1) // wsptr[1]
3231. ins t5, t6, 16, 16
3232. ins t7, t8, 16, 16
3233. mult $ac0, zero, zero
3234. dpa.w.ph $ac0, t5, s0
3235. dpa.w.ph $ac0, t7, s1
3236. mult $ac1, zero, zero
3237. dpa.w.ph $ac1, t5, s2
3238. dpa.w.ph $ac1, t7, s3
3239. sll t2, t2, 14 // tmp0 = ((JLONG)wsptr[0]) << (CONST_BITS+1)
3240. mflo s6, $ac0
3241. // MULTIPLY(wsptr[2], FIX_1_847759065 + MULTIPLY(wsptr[6], -FIX_0_765366865)
3242. subu s4, s4, s5
3243. addu t3, t2, s4 // tmp10 = tmp0 + z2;
3244. mflo s7, $ac1
3245. subu t4, t2, s4 // tmp12 = tmp0 - z2;
3246. addu t7, t4, s6
3247. subu t8, t4, s6
3248. addu t5, t3, s7
3249. subu t6, t3, s7
3250. shra_r.w t5, t5, 19 // DESCALE(tmp10 + temp2, 19)
3251. shra_r.w t6, t6, 19 // DESCALE(tmp10 - temp2, 19)
3252. shra_r.w t7, t7, 19 // DESCALE(tmp12 + temp1, 19)
3253. shra_r.w t8, t8, 19 // DESCALE(tmp12 - temp1, 19)
3254. sll s4, t9, 2
3255. lw v0, 12(a2) // output_buf[ctr]
3256. shll_s.w t5, t5, 24
3257. shll_s.w t6, t6, 24
3258. shll_s.w t7, t7, 24
3259. shll_s.w t8, t8, 24
3260. sra t5, t5, 24
3261. sra t6, t6, 24
3262. sra t7, t7, 24
3263. sra t8, t8, 24
3264. addu v0, v0, a3 // outptr = output_buf[ctr] + output_col
3265. addiu t5, t5, 128
3266. addiu t6, t6, 128
3267. addiu t7, t7, 128
3268. addiu t8, t8, 128
3269. sb t5, 0(v0)
3270. sb t7, 1(v0)
3271. sb t8, 2(v0)
3272. sb t6, 3(v0)
3273. RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
3274. j ra
3275. nop
3276. END(jsimd_idct_4x4_dspr2)
  3277. /*****************************************************************************/
3278. LEAF_DSPR2(jsimd_idct_6x6_dspr2)
3279. /*
3280. * a0 = compptr->dct_table
3281. * a1 = coef_block
3282. * a2 = output_buf
3283. * a3 = output_col
3284. */
/*
 * Reduced-size 6x6 inverse DCT.  Pass 1 (loop 1) transforms 6 columns
 * into a 144-byte stack workspace; pass 2 (loop 2) transforms 6 rows
 * from the workspace into 6 output pixels each, with saturation
 * (shll_s.w/sra) and +128 level shift.  Constants 5793, 10033 and 2998
 * are the fixed-point multipliers of the 6x6 scaled IDCT.
 */
3285. .set at
3286. SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
3287. addiu sp, sp, -144 // allocate 6x6 word workspace
3288. move v0, sp
3289. addiu v1, v0, 24 // v1 = end of first row of workspace
3290. addiu t9, zero, 5793
3291. addiu s0, zero, 10033
3292. addiu s1, zero, 2998
3293. 1:
3294. lh s2, 0(a0) // q0 = quantptr[ 0]
3295. lh s3, 32(a0) // q1 = quantptr[16]
3296. lh s4, 64(a0) // q2 = quantptr[32]
3297. lh t2, 64(a1) // tmp2 = inptr[32]
3298. lh t1, 32(a1) // tmp1 = inptr[16]
3299. lh t0, 0(a1) // tmp0 = inptr[ 0]
3300. mul t2, t2, s4 // tmp2 = tmp2 * q2
3301. mul t1, t1, s3 // tmp1 = tmp1 * q1
3302. mul t0, t0, s2 // tmp0 = tmp0 * q0
3303. lh t6, 16(a1) // z1 = inptr[ 8]
3304. lh t8, 80(a1) // z3 = inptr[40]
3305. lh t7, 48(a1) // z2 = inptr[24]
3306. lh s2, 16(a0) // q0 = quantptr[ 8]
3307. lh s4, 80(a0) // q2 = quantptr[40]
3308. lh s3, 48(a0) // q1 = quantptr[24]
3309. mul t2, t2, t9 // tmp2 = tmp2 * 5793
3310. mul t1, t1, s0 // tmp1 = tmp1 * 10033
3311. sll t0, t0, 13 // tmp0 = tmp0 << 13
3312. mul t6, t6, s2 // z1 = z1 * q0
3313. mul t8, t8, s4 // z3 = z3 * q2
3314. mul t7, t7, s3 // z2 = z2 * q1
3315. addu t3, t0, t2 // tmp10 = tmp0 + tmp2
3316. sll t2, t2, 1 // tmp2 = tmp2 << 1
3317. subu t4, t0, t2 // tmp11 = tmp0 - tmp2;
3318. subu t5, t3, t1 // tmp12 = tmp10 - tmp1
3319. addu t3, t3, t1 // tmp10 = tmp10 + tmp1
3320. addu t1, t6, t8 // tmp1 = z1 + z3
3321. mul t1, t1, s1 // tmp1 = tmp1 * 2998
3322. shra_r.w t4, t4, 11 // tmp11 = (tmp11 + 1024) >> 11
3323. subu t2, t6, t8 // tmp2 = z1 - z3
3324. subu t2, t2, t7 // tmp2 = tmp2 - z2
3325. sll t2, t2, 2 // tmp2 = tmp2 << 2
3326. addu t0, t6, t7 // tmp0 = z1 + z2
3327. sll t0, t0, 13 // tmp0 = tmp0 << 13
3328. subu s2, t8, t7 // q0 = z3 - z2
3329. sll s2, s2, 13 // q0 = q0 << 13
3330. addu t0, t0, t1 // tmp0 = tmp0 + tmp1
3331. addu t1, s2, t1 // tmp1 = q0 + tmp1
3332. addu s2, t4, t2 // q0 = tmp11 + tmp2
3333. subu s3, t4, t2 // q1 = tmp11 - tmp2
3334. addu t6, t3, t0 // z1 = tmp10 + tmp0
3335. subu t7, t3, t0 // z2 = tmp10 - tmp0
3336. addu t4, t5, t1 // tmp11 = tmp12 + tmp1
3337. subu t5, t5, t1 // tmp12 = tmp12 - tmp1
3338. shra_r.w t6, t6, 11 // z1 = (z1 + 1024) >> 11
3339. shra_r.w t7, t7, 11 // z2 = (z2 + 1024) >> 11
3340. shra_r.w t4, t4, 11 // tmp11 = (tmp11 + 1024) >> 11
3341. shra_r.w t5, t5, 11 // tmp12 = (tmp12 + 1024) >> 11
3342. sw s2, 24(v0)
3343. sw s3, 96(v0)
3344. sw t6, 0(v0)
3345. sw t7, 120(v0)
3346. sw t4, 48(v0)
3347. sw t5, 72(v0)
3348. addiu v0, v0, 4
3349. addiu a1, a1, 2
3350. bne v0, v1, 1b
3351. addiu a0, a0, 2 // delay slot: advance quant pointer
3352. /* Pass 2: process 6 rows from work array, store into output array. */
3353. move v0, sp
3354. addiu v1, v0, 144 // v1 = workspace end
3355. 2:
3356. lw t0, 0(v0)
3357. lw t2, 16(v0)
3358. lw s5, 0(a2) // output_buf[row]
3359. addiu t0, t0, 16 // bias for final rounding
3360. sll t0, t0, 13
3361. mul t3, t2, t9
3362. lw t6, 4(v0)
3363. lw t8, 20(v0)
3364. lw t7, 12(v0)
3365. addu s5, s5, a3 // outptr = output_buf[row] + output_col
3366. addu s6, t6, t8
3367. mul s6, s6, s1
3368. addu t1, t0, t3
3369. subu t4, t0, t3
3370. subu t4, t4, t3
3371. lw t3, 8(v0)
3372. mul t0, t3, s0
3373. addu s7, t6, t7
3374. sll s7, s7, 13
3375. addu s7, s6, s7
3376. subu t2, t8, t7
3377. sll t2, t2, 13
3378. addu t2, s6, t2
3379. subu s6, t6, t7
3380. subu s6, s6, t8
3381. sll s6, s6, 13
3382. addu t3, t1, t0
3383. subu t5, t1, t0
3384. addu t6, t3, s7
3385. subu t3, t3, s7
3386. addu t7, t4, s6
3387. subu t4, t4, s6
3388. addu t8, t5, t2
3389. subu t5, t5, t2
// saturate each result to 8 bits (shll_s.w clamps, sra extracts),
// then +128 level shift before the byte store
3390. shll_s.w t6, t6, 6
3391. shll_s.w t3, t3, 6
3392. shll_s.w t7, t7, 6
3393. shll_s.w t4, t4, 6
3394. shll_s.w t8, t8, 6
3395. shll_s.w t5, t5, 6
3396. sra t6, t6, 24
3397. addiu t6, t6, 128
3398. sra t3, t3, 24
3399. addiu t3, t3, 128
3400. sb t6, 0(s5)
3401. sra t7, t7, 24
3402. addiu t7, t7, 128
3403. sb t3, 5(s5)
3404. sra t4, t4, 24
3405. addiu t4, t4, 128
3406. sb t7, 1(s5)
3407. sra t8, t8, 24
3408. addiu t8, t8, 128
3409. sb t4, 4(s5)
3410. addiu v0, v0, 24 // next workspace row
3411. sra t5, t5, 24
3412. addiu t5, t5, 128
3413. sb t8, 2(s5)
3414. addiu a2, a2, 4
3415. bne v0, v1, 2b
3416. sb t5, 3(s5) // delay slot: last byte store of the row
3417. addiu sp, sp, 144 // release workspace
3418. RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
3419. j ra
3420. nop
3421. END(jsimd_idct_6x6_dspr2)
  3422. /*****************************************************************************/
3423. LEAF_DSPR2(jsimd_idct_12x12_pass1_dspr2)
3424. /*
3425. * a0 = compptr->dct_table
3426. * a1 = coef_block
3427. * a2 = workspace
3428. */
/*
 * 12x12 scaled inverse DCT, pass 1: transforms 8 coefficient columns
 * into a 12-row word workspace (stride 32 bytes per output row).
 * Odd and even parts follow the libjpeg jidctint.c 12x12 algorithm;
 * the fixed-point FIX() constants are named in the inline comments.
 */
3429. SAVE_REGS_ON_STACK 16, s0, s1, s2, s3
3430. li a3, 8 // column counter
3431. 1:
3432. // odd part
3433. lh t0, 48(a1)
3434. lh t1, 48(a0)
3435. lh t2, 16(a1)
3436. lh t3, 16(a0)
3437. lh t4, 80(a1)
3438. lh t5, 80(a0)
3439. lh t6, 112(a1)
3440. lh t7, 112(a0)
3441. mul t0, t0, t1 // z2
3442. mul t1, t2, t3 // z1
3443. mul t2, t4, t5 // z3
3444. mul t3, t6, t7 // z4
3445. li t4, 10703 // FIX(1.306562965)
3446. li t5, 4433 // FIX_0_541196100
3447. li t6, 7053 // FIX(0.860918669)
3448. mul t4, t0, t4 // tmp11
3449. mul t5, t0, t5 // -tmp14
3450. addu t7, t1, t2 // tmp10
3451. addu t8, t7, t3 // tmp10 + z4
3452. mul t6, t6, t8 // tmp15
3453. li t8, 2139 // FIX(0.261052384)
3454. mul t8, t7, t8 // MULTIPLY(tmp10, FIX(0.261052384))
3455. li t7, 2295 // FIX(0.280143716)
3456. mul t7, t1, t7 // MULTIPLY(z1, FIX(0.280143716))
3457. addu t9, t2, t3 // z3 + z4
3458. li s0, 8565 // FIX(1.045510580)
3459. mul t9, t9, s0 // -tmp13
3460. li s0, 12112 // FIX(1.478575242)
3461. mul s0, t2, s0 // MULTIPLY(z3, FIX(1.478575242))
3462. li s1, 12998 // FIX(1.586706681)
3463. mul s1, t3, s1 // MULTIPLY(z4, FIX(1.586706681))
3464. li s2, 5540 // FIX(0.676326758)
3465. mul s2, t1, s2 // MULTIPLY(z1, FIX(0.676326758))
3466. li s3, 16244 // FIX(1.982889723)
3467. mul s3, t3, s3 // MULTIPLY(z4, FIX(1.982889723))
3468. subu t1, t1, t3 // z1-=z4
3469. subu t0, t0, t2 // z2-=z3
3470. addu t2, t0, t1 // z1+z2
3471. li t3, 4433 // FIX_0_541196100
3472. mul t2, t2, t3 // z3
3473. li t3, 6270 // FIX_0_765366865
3474. mul t1, t1, t3 // MULTIPLY(z1, FIX_0_765366865)
3475. li t3, 15137 // FIX_1_847759065
3476. mul t0, t0, t3 // MULTIPLY(z2, FIX_1_847759065)
3477. addu t8, t6, t8 // tmp12
3478. addu t3, t8, t4 // tmp12 + tmp11
3479. addu t3, t3, t7 // tmp10
3480. subu t8, t8, t9 // tmp12 + tmp13
3481. addu s0, t5, s0
3482. subu t8, t8, s0 // tmp12
3483. subu t9, t6, t9
3484. subu s1, s1, t4
3485. addu t9, t9, s1 // tmp13
3486. subu t6, t6, t5
3487. subu t6, t6, s2
3488. subu t6, t6, s3 // tmp15
3489. // even part start
3490. lh t4, 64(a1)
3491. lh t5, 64(a0)
3492. lh t7, 32(a1)
3493. lh s0, 32(a0)
3494. lh s1, 0(a1)
3495. lh s2, 0(a0)
3496. lh s3, 96(a1)
3497. lh v0, 96(a0)
3498. mul t4, t4, t5 // DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4])
3499. mul t5, t7, s0 // DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2])
3500. mul t7, s1, s2 // DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0])
3501. mul s0, s3, v0 // DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6])
3502. // odd part end
3503. addu t1, t2, t1 // tmp11
3504. subu t0, t2, t0 // tmp14
3505. // update counter and pointers
3506. addiu a3, a3, -1
3507. addiu a0, a0, 2
3508. addiu a1, a1, 2
3509. // even part rest
3510. li s1, 10033
3511. li s2, 11190
3512. mul t4, t4, s1 // z4
3513. mul s1, t5, s2 // z4
3514. sll t5, t5, 13 // z1
3515. sll t7, t7, 13
3516. addiu t7, t7, 1024 // z3 (rounding bias folded in)
3517. sll s0, s0, 13 // z2
3518. addu s2, t7, t4 // tmp10
3519. subu t4, t7, t4 // tmp11
3520. subu s3, t5, s0 // tmp12
3521. addu t2, t7, s3 // tmp21
3522. subu s3, t7, s3 // tmp24
3523. addu t7, s1, s0 // tmp12
3524. addu v0, s2, t7 // tmp20
3525. subu s2, s2, t7 // tmp25
3526. subu s1, s1, t5 // z4 - z1
3527. subu s1, s1, s0 // tmp12
3528. addu s0, t4, s1 // tmp22
3529. subu t4, t4, s1 // tmp23
3530. // final output stage
3531. addu t5, v0, t3
3532. subu v0, v0, t3
3533. addu t3, t2, t1
3534. subu t2, t2, t1
3535. addu t1, s0, t8
3536. subu s0, s0, t8
3537. addu t8, t4, t9
3538. subu t4, t4, t9
3539. addu t9, s3, t0
3540. subu s3, s3, t0
3541. addu t0, s2, t6
3542. subu s2, s2, t6
// descale all 12 outputs by 11 bits
3543. sra t5, t5, 11
3544. sra t3, t3, 11
3545. sra t1, t1, 11
3546. sra t8, t8, 11
3547. sra t9, t9, 11
3548. sra t0, t0, 11
3549. sra s2, s2, 11
3550. sra s3, s3, 11
3551. sra t4, t4, 11
3552. sra s0, s0, 11
3553. sra t2, t2, 11
3554. sra v0, v0, 11
// store the 12 results of this column, 32 bytes apart per row
3555. sw t5, 0(a2)
3556. sw t3, 32(a2)
3557. sw t1, 64(a2)
3558. sw t8, 96(a2)
3559. sw t9, 128(a2)
3560. sw t0, 160(a2)
3561. sw s2, 192(a2)
3562. sw s3, 224(a2)
3563. sw t4, 256(a2)
3564. sw s0, 288(a2)
3565. sw t2, 320(a2)
3566. sw v0, 352(a2)
3567. bgtz a3, 1b
3568. addiu a2, a2, 4 // delay slot: advance workspace pointer
3569. RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3
3570. j ra
3571. nop
3572. END(jsimd_idct_12x12_pass1_dspr2)
  3573. /*****************************************************************************/
3574. LEAF_DSPR2(jsimd_idct_12x12_pass2_dspr2)
3575. /*
3576. * a0 = workspace
3577. * a1 = output
3578. */
//
// Second (row) pass of the scaled inverse DCT producing a 12x12 output
// block.  Pass 1 left a workspace of 32-bit values with a row stride of
// 32 bytes; each loop iteration below consumes one 8-entry workspace row
// (offsets 0..28 from a0) and writes 12 clamped 8-bit samples to one
// output row.  a3 counts 12 iterations; a0 advances 32 bytes per row and
// a1 advances 4 bytes to the next output row pointer.
// Clobbers t0-t9, v0, a3; s0-s3 are preserved on the stack.
//
3579. SAVE_REGS_ON_STACK 16, s0, s1, s2, s3
3580. li a3, 12
3581. 1:
3582. // Odd part
3583. lw t0, 12(a0)
3584. lw t1, 4(a0)
3585. lw t2, 20(a0)
3586. lw t3, 28(a0)
3587. li t4, 10703 // FIX(1.306562965)
3588. li t5, 4433 // FIX_0_541196100
3589. mul t4, t0, t4 // tmp11
3590. mul t5, t0, t5 // -tmp14
3591. addu t6, t1, t2 // tmp10
3592. li t7, 2139 // FIX(0.261052384)
3593. mul t7, t6, t7 // MULTIPLY(tmp10, FIX(0.261052384))
3594. addu t6, t6, t3 // tmp10 + z4
3595. li t8, 7053 // FIX(0.860918669)
3596. mul t6, t6, t8 // tmp15
3597. li t8, 2295 // FIX(0.280143716)
3598. mul t8, t1, t8 // MULTIPLY(z1, FIX(0.280143716))
3599. addu t9, t2, t3 // z3 + z4
3600. li s0, 8565 // FIX(1.045510580)
3601. mul t9, t9, s0 // -tmp13
3602. li s0, 12112 // FIX(1.478575242)
3603. mul s0, t2, s0 // MULTIPLY(z3, FIX(1.478575242))
3604. li s1, 12998 // FIX(1.586706681)
3605. mul s1, t3, s1 // MULTIPLY(z4, FIX(1.586706681))
3606. li s2, 5540 // FIX(0.676326758)
3607. mul s2, t1, s2 // MULTIPLY(z1, FIX(0.676326758))
3608. li s3, 16244 // FIX(1.982889723)
3609. mul s3, t3, s3 // MULTIPLY(z4, FIX(1.982889723))
3610. subu t1, t1, t3 // z1 -= z4
3611. subu t0, t0, t2 // z2 -= z3
3612. addu t2, t1, t0 // z1 + z2
3613. li t3, 4433 // FIX_0_541196100
3614. mul t2, t2, t3 // z3
3615. li t3, 6270 // FIX_0_765366865
3616. mul t1, t1, t3 // MULTIPLY(z1, FIX_0_765366865)
3617. li t3, 15137 // FIX_1_847759065
3618. mul t0, t0, t3 // MULTIPLY(z2, FIX_1_847759065)
// Combine the products into the six odd-part temporaries tmp10..tmp15.
3619. addu t3, t6, t7 // tmp12
3620. addu t7, t3, t4
3621. addu t7, t7, t8 // tmp10
3622. subu t3, t3, t9
3623. subu t3, t3, t5
3624. subu t3, t3, s0 // tmp12
3625. subu t9, t6, t9
3626. subu t9, t9, t4
3627. addu t9, t9, s1 // tmp13
3628. subu t6, t6, t5
3629. subu t6, t6, s2
3630. subu t6, t6, s3 // tmp15
3631. addu t1, t2, t1 // tmp11
3632. subu t0, t2, t0 // tmp14
3633. // even part
3634. lw t2, 16(a0) // z4
3635. lw t4, 8(a0) // z1
3636. lw t5, 0(a0) // z3
3637. lw t8, 24(a0) // z2
3638. li s0, 10033 // FIX(1.224744871)
3639. li s1, 11190 // FIX(1.366025404)
3640. mul t2, t2, s0 // z4
3641. mul s0, t4, s1 // z4
// +16 biases the DC term so the final right shift rounds rather than
// truncates; z1/z2/z3 are promoted to the 13-bit fixed-point scale.
3642. addiu t5, t5, 0x10
3643. sll t5, t5, 13 // z3
3644. sll t4, t4, 13 // z1
3645. sll t8, t8, 13 // z2
3646. subu s1, t4, t8 // tmp12
3647. addu s2, t5, t2 // tmp10
3648. subu t2, t5, t2 // tmp11
3649. addu s3, t5, s1 // tmp21
3650. subu s1, t5, s1 // tmp24
3651. addu t5, s0, t8 // tmp12
3652. addu v0, s2, t5 // tmp20
3653. subu t5, s2, t5 // tmp25
3654. subu t4, s0, t4
3655. subu t4, t4, t8 // tmp12
3656. addu t8, t2, t4 // tmp22
3657. subu t2, t2, t4 // tmp23
3658. // increment counter and pointers
3659. addiu a3, a3, -1
3660. addiu a0, a0, 32
3661. // Final stage
// Butterfly: each even temporary tmp2x pairs with an odd temporary to
// produce one "sum" and one "difference" output sample.
3662. addu t4, v0, t7
3663. subu v0, v0, t7
3664. addu t7, s3, t1
3665. subu s3, s3, t1
3666. addu t1, t8, t3
3667. subu t8, t8, t3
3668. addu t3, t2, t9
3669. subu t2, t2, t9
3670. addu t9, s1, t0
3671. subu s1, s1, t0
3672. addu t0, t5, t6
3673. subu t5, t5, t6
// Descale and range-limit each of the 12 results: << 4, then a
// saturating << 2 (shll_s.w clamps to the 32-bit signed range), then
// keep the top byte via >> 24.
3674. sll t4, t4, 4
3675. sll t7, t7, 4
3676. sll t1, t1, 4
3677. sll t3, t3, 4
3678. sll t9, t9, 4
3679. sll t0, t0, 4
3680. sll t5, t5, 4
3681. sll s1, s1, 4
3682. sll t2, t2, 4
3683. sll t8, t8, 4
3684. sll s3, s3, 4
3685. sll v0, v0, 4
3686. shll_s.w t4, t4, 2
3687. shll_s.w t7, t7, 2
3688. shll_s.w t1, t1, 2
3689. shll_s.w t3, t3, 2
3690. shll_s.w t9, t9, 2
3691. shll_s.w t0, t0, 2
3692. shll_s.w t5, t5, 2
3693. shll_s.w s1, s1, 2
3694. shll_s.w t2, t2, 2
3695. shll_s.w t8, t8, 2
3696. shll_s.w s3, s3, 2
3697. shll_s.w v0, v0, 2
3698. srl t4, t4, 24
3699. srl t7, t7, 24
3700. srl t1, t1, 24
3701. srl t3, t3, 24
3702. srl t9, t9, 24
3703. srl t0, t0, 24
3704. srl t5, t5, 24
3705. srl s1, s1, 24
3706. srl t2, t2, 24
3707. srl t8, t8, 24
3708. srl s3, s3, 24
3709. srl v0, v0, 24
3710. lw t6, 0(a1) // t6 = pointer to the current output row
// Level-shift by +128 into the unsigned sample range, then store the
// low byte of each result as the 12 samples of this output row.
3711. addiu t4, t4, 0x80
3712. addiu t7, t7, 0x80
3713. addiu t1, t1, 0x80
3714. addiu t3, t3, 0x80
3715. addiu t9, t9, 0x80
3716. addiu t0, t0, 0x80
3717. addiu t5, t5, 0x80
3718. addiu s1, s1, 0x80
3719. addiu t2, t2, 0x80
3720. addiu t8, t8, 0x80
3721. addiu s3, s3, 0x80
3722. addiu v0, v0, 0x80
3723. sb t4, 0(t6)
3724. sb t7, 1(t6)
3725. sb t1, 2(t6)
3726. sb t3, 3(t6)
3727. sb t9, 4(t6)
3728. sb t0, 5(t6)
3729. sb t5, 6(t6)
3730. sb s1, 7(t6)
3731. sb t2, 8(t6)
3732. sb t8, 9(t6)
3733. sb s3, 10(t6)
3734. sb v0, 11(t6)
3735. bgtz a3, 1b
3736. addiu a1, a1, 4 // (branch delay slot) advance to next row pointer
3737. RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3
3738. jr ra
3739. nop // (delay slot)
3740. END(jsimd_idct_12x12_pass2_dspr2)
  3741. /*****************************************************************************/
3742. LEAF_DSPR2(jsimd_convsamp_dspr2)
3743. /*
3744. * a0 = sample_data
3745. * a1 = start_col
3746. * a2 = workspace
3747. */
//
// Converts one 8x8 block of 8-bit samples to 16-bit level-shifted
// values.  For each of the 8 rows: fetch the row pointer from
// sample_data, offset it by start_col, load 8 bytes with
// unaligned-safe ulw, zero-extend byte pairs to halfwords
// (preceu.ph.qbr/qbl), add -128 to each 16-bit lane (addu.ph with
// t7 = 0xff80ff80; 0xff80 == -128 per lane), and store 16 bytes to the
// workspace (usw).  The rows are software-pipelined: each row's loads
// and extends are interleaved with the previous row's stores, so the
// exact instruction order below is intentional.
//
3748. lw t0, 0(a0) // row 0 pointer
3749. li t7, 0xff80ff80 // per-lane -128 level-shift constant
3750. addu t0, t0, a1
3751. ulw t1, 0(t0)
3752. ulw t2, 4(t0)
3753. preceu.ph.qbr t3, t1
3754. preceu.ph.qbl t4, t1
3755. lw t0, 4(a0) // row 1 pointer (overlapped with row 0 work)
3756. preceu.ph.qbr t5, t2
3757. preceu.ph.qbl t6, t2
3758. addu t0, t0, a1
3759. addu.ph t3, t3, t7
3760. addu.ph t4, t4, t7
3761. ulw t1, 0(t0)
3762. ulw t2, 4(t0)
3763. addu.ph t5, t5, t7
3764. addu.ph t6, t6, t7
3765. usw t3, 0(a2)
3766. usw t4, 4(a2)
3767. preceu.ph.qbr t3, t1
3768. preceu.ph.qbl t4, t1
3769. usw t5, 8(a2)
3770. usw t6, 12(a2)
3771. lw t0, 8(a0) // row 2 pointer
3772. preceu.ph.qbr t5, t2
3773. preceu.ph.qbl t6, t2
3774. addu t0, t0, a1
3775. addu.ph t3, t3, t7
3776. addu.ph t4, t4, t7
3777. ulw t1, 0(t0)
3778. ulw t2, 4(t0)
3779. addu.ph t5, t5, t7
3780. addu.ph t6, t6, t7
3781. usw t3, 16(a2)
3782. usw t4, 20(a2)
3783. preceu.ph.qbr t3, t1
3784. preceu.ph.qbl t4, t1
3785. usw t5, 24(a2)
3786. usw t6, 28(a2)
3787. lw t0, 12(a0) // row 3 pointer
3788. preceu.ph.qbr t5, t2
3789. preceu.ph.qbl t6, t2
3790. addu t0, t0, a1
3791. addu.ph t3, t3, t7
3792. addu.ph t4, t4, t7
3793. ulw t1, 0(t0)
3794. ulw t2, 4(t0)
3795. addu.ph t5, t5, t7
3796. addu.ph t6, t6, t7
3797. usw t3, 32(a2)
3798. usw t4, 36(a2)
3799. preceu.ph.qbr t3, t1
3800. preceu.ph.qbl t4, t1
3801. usw t5, 40(a2)
3802. usw t6, 44(a2)
3803. lw t0, 16(a0) // row 4 pointer
3804. preceu.ph.qbr t5, t2
3805. preceu.ph.qbl t6, t2
3806. addu t0, t0, a1
3807. addu.ph t3, t3, t7
3808. addu.ph t4, t4, t7
3809. ulw t1, 0(t0)
3810. ulw t2, 4(t0)
3811. addu.ph t5, t5, t7
3812. addu.ph t6, t6, t7
3813. usw t3, 48(a2)
3814. usw t4, 52(a2)
3815. preceu.ph.qbr t3, t1
3816. preceu.ph.qbl t4, t1
3817. usw t5, 56(a2)
3818. usw t6, 60(a2)
3819. lw t0, 20(a0) // row 5 pointer
3820. preceu.ph.qbr t5, t2
3821. preceu.ph.qbl t6, t2
3822. addu t0, t0, a1
3823. addu.ph t3, t3, t7
3824. addu.ph t4, t4, t7
3825. ulw t1, 0(t0)
3826. ulw t2, 4(t0)
3827. addu.ph t5, t5, t7
3828. addu.ph t6, t6, t7
3829. usw t3, 64(a2)
3830. usw t4, 68(a2)
3831. preceu.ph.qbr t3, t1
3832. preceu.ph.qbl t4, t1
3833. usw t5, 72(a2)
3834. usw t6, 76(a2)
3835. lw t0, 24(a0) // row 6 pointer
3836. preceu.ph.qbr t5, t2
3837. preceu.ph.qbl t6, t2
3838. addu t0, t0, a1
3839. addu.ph t3, t3, t7
3840. addu.ph t4, t4, t7
3841. ulw t1, 0(t0)
3842. ulw t2, 4(t0)
3843. addu.ph t5, t5, t7
3844. addu.ph t6, t6, t7
3845. usw t3, 80(a2)
3846. usw t4, 84(a2)
3847. preceu.ph.qbr t3, t1
3848. preceu.ph.qbl t4, t1
3849. usw t5, 88(a2)
3850. usw t6, 92(a2)
3851. lw t0, 28(a0) // row 7 pointer
3852. preceu.ph.qbr t5, t2
3853. preceu.ph.qbl t6, t2
3854. addu t0, t0, a1
3855. addu.ph t3, t3, t7
3856. addu.ph t4, t4, t7
3857. ulw t1, 0(t0)
3858. ulw t2, 4(t0)
3859. addu.ph t5, t5, t7
3860. addu.ph t6, t6, t7
3861. usw t3, 96(a2)
3862. usw t4, 100(a2)
3863. preceu.ph.qbr t3, t1
3864. preceu.ph.qbl t4, t1
3865. usw t5, 104(a2)
3866. usw t6, 108(a2)
// Drain the pipeline: finish row 7 (no further loads to overlap).
3867. preceu.ph.qbr t5, t2
3868. preceu.ph.qbl t6, t2
3869. addu.ph t3, t3, t7
3870. addu.ph t4, t4, t7
3871. addu.ph t5, t5, t7
3872. addu.ph t6, t6, t7
3873. usw t3, 112(a2)
3874. usw t4, 116(a2)
3875. usw t5, 120(a2)
3876. usw t6, 124(a2)
3877. j ra
3878. nop // (delay slot)
3879. END(jsimd_convsamp_dspr2)
  3880. #ifndef __mips_soft_float
  3881. /*****************************************************************************/
3882. LEAF_DSPR2(jsimd_convsamp_float_dspr2)
3883. /*
3884. * a0 = sample_data
3885. * a1 = start_col
3886. * a2 = workspace
3887. */
//
// Floating-point variant of convsamp: converts one 8x8 block of 8-bit
// samples to level-shifted floats.  Fully unrolled over the 8 rows; for
// each row: load 8 bytes (lbu), subtract 128, move the integers to the
// FPU (mtc1), convert int -> single (cvt.s.w), and store 8 floats
// (swc1, 32 bytes per row).  The next row's pointer load (lw t0 /
// addu t0) is slotted between the stores to hide load latency.
//
3888. .set at // re-enable the assembler temporary register for macro expansion
3889. lw t0, 0(a0) // row 0 pointer
3890. addu t0, t0, a1
3891. lbu t1, 0(t0)
3892. lbu t2, 1(t0)
3893. lbu t3, 2(t0)
3894. lbu t4, 3(t0)
3895. lbu t5, 4(t0)
3896. lbu t6, 5(t0)
3897. lbu t7, 6(t0)
3898. lbu t8, 7(t0)
3899. addiu t1, t1, -128
3900. addiu t2, t2, -128
3901. addiu t3, t3, -128
3902. addiu t4, t4, -128
3903. addiu t5, t5, -128
3904. addiu t6, t6, -128
3905. addiu t7, t7, -128
3906. addiu t8, t8, -128
3907. mtc1 t1, f2
3908. mtc1 t2, f4
3909. mtc1 t3, f6
3910. mtc1 t4, f8
3911. mtc1 t5, f10
3912. mtc1 t6, f12
3913. mtc1 t7, f14
3914. mtc1 t8, f16
3915. cvt.s.w f2, f2
3916. cvt.s.w f4, f4
3917. cvt.s.w f6, f6
3918. cvt.s.w f8, f8
3919. cvt.s.w f10, f10
3920. cvt.s.w f12, f12
3921. cvt.s.w f14, f14
3922. cvt.s.w f16, f16
3923. lw t0, 4(a0) // row 1 pointer, loaded early
3924. swc1 f2, 0(a2)
3925. swc1 f4, 4(a2)
3926. swc1 f6, 8(a2)
3927. addu t0, t0, a1
3928. swc1 f8, 12(a2)
3929. swc1 f10, 16(a2)
3930. swc1 f12, 20(a2)
3931. swc1 f14, 24(a2)
3932. swc1 f16, 28(a2)
3933. // element row 1
3934. lbu t1, 0(t0)
3935. lbu t2, 1(t0)
3936. lbu t3, 2(t0)
3937. lbu t4, 3(t0)
3938. lbu t5, 4(t0)
3939. lbu t6, 5(t0)
3940. lbu t7, 6(t0)
3941. lbu t8, 7(t0)
3942. addiu t1, t1, -128
3943. addiu t2, t2, -128
3944. addiu t3, t3, -128
3945. addiu t4, t4, -128
3946. addiu t5, t5, -128
3947. addiu t6, t6, -128
3948. addiu t7, t7, -128
3949. addiu t8, t8, -128
3950. mtc1 t1, f2
3951. mtc1 t2, f4
3952. mtc1 t3, f6
3953. mtc1 t4, f8
3954. mtc1 t5, f10
3955. mtc1 t6, f12
3956. mtc1 t7, f14
3957. mtc1 t8, f16
3958. cvt.s.w f2, f2
3959. cvt.s.w f4, f4
3960. cvt.s.w f6, f6
3961. cvt.s.w f8, f8
3962. cvt.s.w f10, f10
3963. cvt.s.w f12, f12
3964. cvt.s.w f14, f14
3965. cvt.s.w f16, f16
3966. lw t0, 8(a0) // row 2 pointer
3967. swc1 f2, 32(a2)
3968. swc1 f4, 36(a2)
3969. swc1 f6, 40(a2)
3970. addu t0, t0, a1
3971. swc1 f8, 44(a2)
3972. swc1 f10, 48(a2)
3973. swc1 f12, 52(a2)
3974. swc1 f14, 56(a2)
3975. swc1 f16, 60(a2)
3976. // element row 2
3977. lbu t1, 0(t0)
3978. lbu t2, 1(t0)
3979. lbu t3, 2(t0)
3980. lbu t4, 3(t0)
3981. lbu t5, 4(t0)
3982. lbu t6, 5(t0)
3983. lbu t7, 6(t0)
3984. lbu t8, 7(t0)
3985. addiu t1, t1, -128
3986. addiu t2, t2, -128
3987. addiu t3, t3, -128
3988. addiu t4, t4, -128
3989. addiu t5, t5, -128
3990. addiu t6, t6, -128
3991. addiu t7, t7, -128
3992. addiu t8, t8, -128
3993. mtc1 t1, f2
3994. mtc1 t2, f4
3995. mtc1 t3, f6
3996. mtc1 t4, f8
3997. mtc1 t5, f10
3998. mtc1 t6, f12
3999. mtc1 t7, f14
4000. mtc1 t8, f16
4001. cvt.s.w f2, f2
4002. cvt.s.w f4, f4
4003. cvt.s.w f6, f6
4004. cvt.s.w f8, f8
4005. cvt.s.w f10, f10
4006. cvt.s.w f12, f12
4007. cvt.s.w f14, f14
4008. cvt.s.w f16, f16
4009. lw t0, 12(a0) // row 3 pointer
4010. swc1 f2, 64(a2)
4011. swc1 f4, 68(a2)
4012. swc1 f6, 72(a2)
4013. addu t0, t0, a1
4014. swc1 f8, 76(a2)
4015. swc1 f10, 80(a2)
4016. swc1 f12, 84(a2)
4017. swc1 f14, 88(a2)
4018. swc1 f16, 92(a2)
4019. // element row 3
4020. lbu t1, 0(t0)
4021. lbu t2, 1(t0)
4022. lbu t3, 2(t0)
4023. lbu t4, 3(t0)
4024. lbu t5, 4(t0)
4025. lbu t6, 5(t0)
4026. lbu t7, 6(t0)
4027. lbu t8, 7(t0)
4028. addiu t1, t1, -128
4029. addiu t2, t2, -128
4030. addiu t3, t3, -128
4031. addiu t4, t4, -128
4032. addiu t5, t5, -128
4033. addiu t6, t6, -128
4034. addiu t7, t7, -128
4035. addiu t8, t8, -128
4036. mtc1 t1, f2
4037. mtc1 t2, f4
4038. mtc1 t3, f6
4039. mtc1 t4, f8
4040. mtc1 t5, f10
4041. mtc1 t6, f12
4042. mtc1 t7, f14
4043. mtc1 t8, f16
4044. cvt.s.w f2, f2
4045. cvt.s.w f4, f4
4046. cvt.s.w f6, f6
4047. cvt.s.w f8, f8
4048. cvt.s.w f10, f10
4049. cvt.s.w f12, f12
4050. cvt.s.w f14, f14
4051. cvt.s.w f16, f16
4052. lw t0, 16(a0) // row 4 pointer
4053. swc1 f2, 96(a2)
4054. swc1 f4, 100(a2)
4055. swc1 f6, 104(a2)
4056. addu t0, t0, a1
4057. swc1 f8, 108(a2)
4058. swc1 f10, 112(a2)
4059. swc1 f12, 116(a2)
4060. swc1 f14, 120(a2)
4061. swc1 f16, 124(a2)
4062. // element row 4
4063. lbu t1, 0(t0)
4064. lbu t2, 1(t0)
4065. lbu t3, 2(t0)
4066. lbu t4, 3(t0)
4067. lbu t5, 4(t0)
4068. lbu t6, 5(t0)
4069. lbu t7, 6(t0)
4070. lbu t8, 7(t0)
4071. addiu t1, t1, -128
4072. addiu t2, t2, -128
4073. addiu t3, t3, -128
4074. addiu t4, t4, -128
4075. addiu t5, t5, -128
4076. addiu t6, t6, -128
4077. addiu t7, t7, -128
4078. addiu t8, t8, -128
4079. mtc1 t1, f2
4080. mtc1 t2, f4
4081. mtc1 t3, f6
4082. mtc1 t4, f8
4083. mtc1 t5, f10
4084. mtc1 t6, f12
4085. mtc1 t7, f14
4086. mtc1 t8, f16
4087. cvt.s.w f2, f2
4088. cvt.s.w f4, f4
4089. cvt.s.w f6, f6
4090. cvt.s.w f8, f8
4091. cvt.s.w f10, f10
4092. cvt.s.w f12, f12
4093. cvt.s.w f14, f14
4094. cvt.s.w f16, f16
4095. lw t0, 20(a0) // row 5 pointer
4096. swc1 f2, 128(a2)
4097. swc1 f4, 132(a2)
4098. swc1 f6, 136(a2)
4099. addu t0, t0, a1
4100. swc1 f8, 140(a2)
4101. swc1 f10, 144(a2)
4102. swc1 f12, 148(a2)
4103. swc1 f14, 152(a2)
4104. swc1 f16, 156(a2)
4105. // element row 5
4106. lbu t1, 0(t0)
4107. lbu t2, 1(t0)
4108. lbu t3, 2(t0)
4109. lbu t4, 3(t0)
4110. lbu t5, 4(t0)
4111. lbu t6, 5(t0)
4112. lbu t7, 6(t0)
4113. lbu t8, 7(t0)
4114. addiu t1, t1, -128
4115. addiu t2, t2, -128
4116. addiu t3, t3, -128
4117. addiu t4, t4, -128
4118. addiu t5, t5, -128
4119. addiu t6, t6, -128
4120. addiu t7, t7, -128
4121. addiu t8, t8, -128
4122. mtc1 t1, f2
4123. mtc1 t2, f4
4124. mtc1 t3, f6
4125. mtc1 t4, f8
4126. mtc1 t5, f10
4127. mtc1 t6, f12
4128. mtc1 t7, f14
4129. mtc1 t8, f16
4130. cvt.s.w f2, f2
4131. cvt.s.w f4, f4
4132. cvt.s.w f6, f6
4133. cvt.s.w f8, f8
4134. cvt.s.w f10, f10
4135. cvt.s.w f12, f12
4136. cvt.s.w f14, f14
4137. cvt.s.w f16, f16
4138. lw t0, 24(a0) // row 6 pointer
4139. swc1 f2, 160(a2)
4140. swc1 f4, 164(a2)
4141. swc1 f6, 168(a2)
4142. addu t0, t0, a1
4143. swc1 f8, 172(a2)
4144. swc1 f10, 176(a2)
4145. swc1 f12, 180(a2)
4146. swc1 f14, 184(a2)
4147. swc1 f16, 188(a2)
4148. // element row 6
4149. lbu t1, 0(t0)
4150. lbu t2, 1(t0)
4151. lbu t3, 2(t0)
4152. lbu t4, 3(t0)
4153. lbu t5, 4(t0)
4154. lbu t6, 5(t0)
4155. lbu t7, 6(t0)
4156. lbu t8, 7(t0)
4157. addiu t1, t1, -128
4158. addiu t2, t2, -128
4159. addiu t3, t3, -128
4160. addiu t4, t4, -128
4161. addiu t5, t5, -128
4162. addiu t6, t6, -128
4163. addiu t7, t7, -128
4164. addiu t8, t8, -128
4165. mtc1 t1, f2
4166. mtc1 t2, f4
4167. mtc1 t3, f6
4168. mtc1 t4, f8
4169. mtc1 t5, f10
4170. mtc1 t6, f12
4171. mtc1 t7, f14
4172. mtc1 t8, f16
4173. cvt.s.w f2, f2
4174. cvt.s.w f4, f4
4175. cvt.s.w f6, f6
4176. cvt.s.w f8, f8
4177. cvt.s.w f10, f10
4178. cvt.s.w f12, f12
4179. cvt.s.w f14, f14
4180. cvt.s.w f16, f16
4181. lw t0, 28(a0) // row 7 pointer
4182. swc1 f2, 192(a2)
4183. swc1 f4, 196(a2)
4184. swc1 f6, 200(a2)
4185. addu t0, t0, a1
4186. swc1 f8, 204(a2)
4187. swc1 f10, 208(a2)
4188. swc1 f12, 212(a2)
4189. swc1 f14, 216(a2)
4190. swc1 f16, 220(a2)
4191. // element row 7 (last row: no further pointer loads to overlap)
4192. lbu t1, 0(t0)
4193. lbu t2, 1(t0)
4194. lbu t3, 2(t0)
4195. lbu t4, 3(t0)
4196. lbu t5, 4(t0)
4197. lbu t6, 5(t0)
4198. lbu t7, 6(t0)
4199. lbu t8, 7(t0)
4200. addiu t1, t1, -128
4201. addiu t2, t2, -128
4202. addiu t3, t3, -128
4203. addiu t4, t4, -128
4204. addiu t5, t5, -128
4205. addiu t6, t6, -128
4206. addiu t7, t7, -128
4207. addiu t8, t8, -128
4208. mtc1 t1, f2
4209. mtc1 t2, f4
4210. mtc1 t3, f6
4211. mtc1 t4, f8
4212. mtc1 t5, f10
4213. mtc1 t6, f12
4214. mtc1 t7, f14
4215. mtc1 t8, f16
4216. cvt.s.w f2, f2
4217. cvt.s.w f4, f4
4218. cvt.s.w f6, f6
4219. cvt.s.w f8, f8
4220. cvt.s.w f10, f10
4221. cvt.s.w f12, f12
4222. cvt.s.w f14, f14
4223. cvt.s.w f16, f16
4224. swc1 f2, 224(a2)
4225. swc1 f4, 228(a2)
4226. swc1 f6, 232(a2)
4227. swc1 f8, 236(a2)
4228. swc1 f10, 240(a2)
4229. swc1 f12, 244(a2)
4230. swc1 f14, 248(a2)
4231. swc1 f16, 252(a2)
4232. j ra
4233. nop // (delay slot)
4234. END(jsimd_convsamp_float_dspr2)
  4235. #endif
  4236. /*****************************************************************************/