DXTEncoder_SSE2.cpp

/*
===========================================================================
Doom 3 BFG Edition GPL Source Code
Copyright (C) 1993-2012 id Software LLC, a ZeniMax Media company.
This file is part of the Doom 3 BFG Edition GPL Source Code ("Doom 3 BFG Edition Source Code").
Doom 3 BFG Edition Source Code is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Doom 3 BFG Edition Source Code is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Doom 3 BFG Edition Source Code. If not, see <http://www.gnu.org/licenses/>.
In addition, the Doom 3 BFG Edition Source Code is also subject to certain additional terms. You should have received a copy of these additional terms immediately following the terms and conditions of the GNU General Public License which accompanied the Doom 3 BFG Edition Source Code. If not, please request a copy in writing from id Software at the address below.
If you have questions concerning this license or the applicable additional terms, you may contact in writing id Software LLC, c/o ZeniMax Media Inc., Suite 120, Rockville, Maryland 20850 USA.
===========================================================================
*/

/*
================================================================================================
Contains the DxtEncoder implementation for SSE2.
================================================================================================
*/

#pragma hdrstop
#include "DXTCodec_local.h"
#include "DXTCodec.h"

#if defined( ID_WIN_X86_SSE2_INTRIN ) || ( ( defined( ID_WIN_X86_ASM ) || defined( ID_MAC_X86_ASM ) ) )

//#define TEST_COMPRESSION
#ifdef TEST_COMPRESSION
#include <malloc.h>
#endif

#define INSET_COLOR_SHIFT 4 // inset the bounding box with ( range >> shift )
#define INSET_ALPHA_SHIFT 5 // inset alpha channel
#define C565_5_MASK 0xF8 // 0xFF minus last three bits
#define C565_6_MASK 0xFC // 0xFF minus last two bits

#define NVIDIA_7X_HARDWARE_BUG_FIX // keep the DXT5 colors sorted as: max, min

#if !defined( R_SHUFFLE_D )
#define R_SHUFFLE_D( x, y, z, w ) (( (w) & 3 ) << 6 | ( (z) & 3 ) << 4 | ( (y) & 3 ) << 2 | ( (x) & 3 ))
#endif
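
// R_SHUFFLE_D builds the 8-bit immediate used by pshufd / _mm_shuffle_epi32: the selector
// for destination lane 0 goes in bits 0-1 (x), up through lane 3 in bits 6-7 (w).
// For example, R_SHUFFLE_D( 2, 3, 2, 3 ) = ( 3 << 6 ) | ( 2 << 4 ) | ( 3 << 2 ) | 2 = 0xEE,
// which copies the upper two dwords of a register into both halves.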

typedef uint16 word;
typedef uint32 dword;

ALIGN16( static __m128i SIMD_SSE2_zero ) = { 0, 0, 0, 0 };
ALIGN16( static dword SIMD_SSE2_dword_byte_mask[4] ) = { 0x000000FF, 0x000000FF, 0x000000FF, 0x000000FF };
ALIGN16( static dword SIMD_SSE2_dword_word_mask[4] ) = { 0x0000FFFF, 0x0000FFFF, 0x0000FFFF, 0x0000FFFF };
ALIGN16( static dword SIMD_SSE2_dword_red_mask[4] ) = { 0x000000FF, 0x000000FF, 0x000000FF, 0x000000FF };
ALIGN16( static dword SIMD_SSE2_dword_green_mask[4] ) = { 0x0000FF00, 0x0000FF00, 0x0000FF00, 0x0000FF00 };
ALIGN16( static dword SIMD_SSE2_dword_blue_mask[4] ) = { 0x00FF0000, 0x00FF0000, 0x00FF0000, 0x00FF0000 };
ALIGN16( static dword SIMD_SSE2_dword_colorMask_1010[4] ) = { 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000 };
ALIGN16( static dword SIMD_SSE2_dword_colorMask_0100[4] ) = { 0x00000000, 0xFFFFFFFF, 0x00000000, 0x00000000 };
ALIGN16( static dword SIMD_SSE2_dword_alpha_bit_mask0[4] ) = { 7<<0, 0, 7<<0, 0 };
ALIGN16( static dword SIMD_SSE2_dword_alpha_bit_mask1[4] ) = { 7<<3, 0, 7<<3, 0 };
ALIGN16( static dword SIMD_SSE2_dword_alpha_bit_mask2[4] ) = { 7<<6, 0, 7<<6, 0 };
ALIGN16( static dword SIMD_SSE2_dword_alpha_bit_mask3[4] ) = { 7<<9, 0, 7<<9, 0 };
ALIGN16( static dword SIMD_SSE2_dword_alpha_bit_mask4[4] ) = { 7<<12, 0, 7<<12, 0 };
ALIGN16( static dword SIMD_SSE2_dword_alpha_bit_mask5[4] ) = { 7<<15, 0, 7<<15, 0 };
ALIGN16( static dword SIMD_SSE2_dword_alpha_bit_mask6[4] ) = { 7<<18, 0, 7<<18, 0 };
ALIGN16( static dword SIMD_SSE2_dword_alpha_bit_mask7[4] ) = { 7<<21, 0, 7<<21, 0 };
ALIGN16( static dword SIMD_SSE2_dword_color_bit_mask0[4] ) = { 3<<0, 0, 3<<0, 0 };
ALIGN16( static dword SIMD_SSE2_dword_color_bit_mask1[4] ) = { 3<<2, 0, 3<<2, 0 };
ALIGN16( static dword SIMD_SSE2_dword_color_bit_mask2[4] ) = { 3<<4, 0, 3<<4, 0 };
ALIGN16( static dword SIMD_SSE2_dword_color_bit_mask3[4] ) = { 3<<6, 0, 3<<6, 0 };
ALIGN16( static dword SIMD_SSE2_dword_color_bit_mask4[4] ) = { 3<<8, 0, 3<<8, 0 };
ALIGN16( static dword SIMD_SSE2_dword_color_bit_mask5[4] ) = { 3<<10, 0, 3<<10, 0 };
ALIGN16( static dword SIMD_SSE2_dword_color_bit_mask6[4] ) = { 3<<12, 0, 3<<12, 0 };
ALIGN16( static dword SIMD_SSE2_dword_color_bit_mask7[4] ) = { 3<<14, 0, 3<<14, 0 };
ALIGN16( static word SIMD_SSE2_word_0[8] ) = { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 };
ALIGN16( static word SIMD_SSE2_word_1[8] ) = { 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001 };
ALIGN16( static word SIMD_SSE2_word_2[8] ) = { 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, 0x0002 };
ALIGN16( static word SIMD_SSE2_word_3[8] ) = { 0x0003, 0x0003, 0x0003, 0x0003, 0x0003, 0x0003, 0x0003, 0x0003 };
ALIGN16( static word SIMD_SSE2_word_7[8] ) = { 0x0007, 0x0007, 0x0007, 0x0007, 0x0007, 0x0007, 0x0007, 0x0007 };
ALIGN16( static word SIMD_SSE2_word_8[8] ) = { 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008 };
ALIGN16( static word SIMD_SSE2_word_31[8] ) = { 31, 31, 31, 31, 31, 31, 31, 31 };
ALIGN16( static word SIMD_SSE2_word_63[8] ) = { 63, 63, 63, 63, 63, 63, 63, 63 };
ALIGN16( static word SIMD_SSE2_word_127[8] ) = { 127, 127, 127, 127, 127, 127, 127, 127 };
ALIGN16( static word SIMD_SSE2_word_255[8] ) = { 255, 255, 255, 255, 255, 255, 255, 255 };
ALIGN16( static word SIMD_SSE2_word_center_128[8] ) = { 128, 128, 0, 0, 0, 0, 0, 0 };
ALIGN16( static word SIMD_SSE2_word_div_by_3[8] ) = { (1<<16)/3+1, (1<<16)/3+1, (1<<16)/3+1, (1<<16)/3+1, (1<<16)/3+1, (1<<16)/3+1, (1<<16)/3+1, (1<<16)/3+1 };
ALIGN16( static word SIMD_SSE2_word_div_by_6[8] ) = { (1<<16)/6+1, (1<<16)/6+1, (1<<16)/6+1, (1<<16)/6+1, (1<<16)/6+1, (1<<16)/6+1, (1<<16)/6+1, (1<<16)/6+1 };
ALIGN16( static word SIMD_SSE2_word_div_by_14[8] ) = { (1<<16)/14+1, (1<<16)/14+1, (1<<16)/14+1, (1<<16)/14+1, (1<<16)/14+1, (1<<16)/14+1, (1<<16)/14+1, (1<<16)/14+1 };
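
// The div_by_* constants replace a division by 3, 6 or 14 with a single pmulhw:
// the high word of x * ( ( 1 << 16 ) / d + 1 ) equals x / d over the input ranges
// used in this file (sums of at most fourteen 8-bit values). A scalar check of the
// d == 3 case, for illustration only:
//
//   for ( int x = 0; x <= 3 * 255; x++ ) {
//       assert( ( x * ( ( 1 << 16 ) / 3 + 1 ) ) >> 16 == x / 3 );
//   }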

ALIGN16( static word SIMD_SSE2_word_scale_7_9_11_13[8] ) = { 7, 7, 9, 9, 11, 11, 13, 13 };
ALIGN16( static word SIMD_SSE2_word_scale_7_5_3_1[8] ) = { 7, 7, 5, 5, 3, 3, 1, 1 };
ALIGN16( static word SIMD_SSE2_word_scale_5_3_1[8] ) = { 5, 3, 1, 0, 5, 3, 1, 0 };
ALIGN16( static word SIMD_SSE2_word_scale_1_3_5[8] ) = { 1, 3, 5, 0, 1, 3, 5, 0 };
ALIGN16( static word SIMD_SSE2_word_insetShift[8] ) = { 1 << ( 16 - INSET_COLOR_SHIFT ), 1 << ( 16 - INSET_COLOR_SHIFT ), 1 << ( 16 - INSET_COLOR_SHIFT ), 1 << ( 16 - INSET_ALPHA_SHIFT ), 0, 0, 0, 0 };
ALIGN16( static word SIMD_SSE2_word_insetYCoCgRound[8] ) = { ((1<<(INSET_COLOR_SHIFT-1))-1), ((1<<(INSET_COLOR_SHIFT-1))-1), ((1<<(INSET_COLOR_SHIFT-1))-1), ((1<<(INSET_ALPHA_SHIFT-1))-1), 0, 0, 0, 0 };
ALIGN16( static word SIMD_SSE2_word_insetYCoCgMask[8] ) = { 0xFFFF, 0xFFFF, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0xFFFF };
ALIGN16( static word SIMD_SSE2_word_insetYCoCgShiftUp[8] ) = { 1 << INSET_COLOR_SHIFT, 1 << INSET_COLOR_SHIFT, 1 << INSET_COLOR_SHIFT, 1 << INSET_ALPHA_SHIFT, 0, 0, 0, 0 };
ALIGN16( static word SIMD_SSE2_word_insetYCoCgShiftDown[8] ) = { 1 << ( 16 - INSET_COLOR_SHIFT ), 1 << ( 16 - INSET_COLOR_SHIFT ), 1 << ( 16 - INSET_COLOR_SHIFT ), 1 << ( 16 - INSET_ALPHA_SHIFT ), 0, 0, 0, 0 };
ALIGN16( static word SIMD_SSE2_word_insetYCoCgQuantMask[8] ) = { C565_5_MASK, C565_6_MASK, C565_5_MASK, 0xFF, C565_5_MASK, C565_6_MASK, C565_5_MASK, 0xFF };
ALIGN16( static word SIMD_SSE2_word_insetYCoCgRep[8] ) = { 1 << ( 16 - 5 ), 1 << ( 16 - 6 ), 1 << ( 16 - 5 ), 0, 1 << ( 16 - 5 ), 1 << ( 16 - 6 ), 1 << ( 16 - 5 ), 0 };
ALIGN16( static word SIMD_SSE2_word_insetNormalDXT5Round[8] ) = { 0, ((1<<(INSET_COLOR_SHIFT-1))-1), 0, ((1<<(INSET_ALPHA_SHIFT-1))-1), 0, 0, 0, 0 };
ALIGN16( static word SIMD_SSE2_word_insetNormalDXT5Mask[8] ) = { 0x0000, 0xFFFF, 0x0000, 0xFFFF, 0x0000, 0x0000, 0x0000, 0x0000 };
ALIGN16( static word SIMD_SSE2_word_insetNormalDXT5ShiftUp[8] ) = { 1, 1 << INSET_COLOR_SHIFT, 1, 1 << INSET_ALPHA_SHIFT, 1, 1, 1, 1 };
ALIGN16( static word SIMD_SSE2_word_insetNormalDXT5ShiftDown[8] ) = { 0, 1 << ( 16 - INSET_COLOR_SHIFT ), 0, 1 << ( 16 - INSET_ALPHA_SHIFT ), 0, 0, 0, 0 };
ALIGN16( static word SIMD_SSE2_word_insetNormalDXT5QuantMask[8] ) = { 0xFF, C565_6_MASK, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF };
ALIGN16( static word SIMD_SSE2_word_insetNormalDXT5Rep[8] ) = { 0, 1 << ( 16 - 6 ), 0, 0, 0, 0, 0, 0 };
ALIGN16( static word SIMD_SSE2_word_insetNormal3DcRound[8] ) = { ((1<<(INSET_ALPHA_SHIFT-1))-1), ((1<<(INSET_ALPHA_SHIFT-1))-1), 0, 0, 0, 0, 0, 0 };
ALIGN16( static word SIMD_SSE2_word_insetNormal3DcMask[8] ) = { 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 };
ALIGN16( static word SIMD_SSE2_word_insetNormal3DcShiftUp[8] ) = { 1 << INSET_ALPHA_SHIFT, 1 << INSET_ALPHA_SHIFT, 1, 1, 1, 1, 1, 1 };
ALIGN16( static word SIMD_SSE2_word_insetNormal3DcShiftDown[8] ) = { 1 << ( 16 - INSET_ALPHA_SHIFT ), 1 << ( 16 - INSET_ALPHA_SHIFT ), 0, 0, 0, 0, 0, 0 };
ALIGN16( static byte SIMD_SSE2_byte_0[16] ) = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 };
ALIGN16( static byte SIMD_SSE2_byte_1[16] ) = { 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01 };
ALIGN16( static byte SIMD_SSE2_byte_2[16] ) = { 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02 };
ALIGN16( static byte SIMD_SSE2_byte_3[16] ) = { 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03 };
ALIGN16( static byte SIMD_SSE2_byte_4[16] ) = { 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04 };
ALIGN16( static byte SIMD_SSE2_byte_7[16] ) = { 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07 };
ALIGN16( static byte SIMD_SSE2_byte_8[16] ) = { 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08 };
ALIGN16( static byte SIMD_SSE2_byte_not[16] ) = { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF };
ALIGN16( static byte SIMD_SSE2_byte_colorMask[16] ) = { C565_5_MASK, C565_6_MASK, C565_5_MASK, 0x00, 0x00, 0x00, 0x00, 0x00, C565_5_MASK, C565_6_MASK, C565_5_MASK, 0x00, 0x00, 0x00, 0x00, 0x00 };
ALIGN16( static byte SIMD_SSE2_byte_colorMask2[16] ) = { 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00 };
ALIGN16( static byte SIMD_SSE2_byte_ctx1Mask[16] ) = { 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 };
ALIGN16( static byte SIMD_SSE2_byte_diagonalMask[16] ) = { 0x00, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 };
ALIGN16( static byte SIMD_SSE2_byte_scale_mask0[16] ) = { 0x00, 0x00, 0xFF, 0xFF, 0x00, 0x00, 0xFF, 0xFF, 0x00, 0x00, 0xFF, 0xFF, 0x00, 0x00, 0xFF, 0xFF };
ALIGN16( static byte SIMD_SSE2_byte_scale_mask1[16] ) = { 0xFF, 0x00, 0x00, 0x00, 0xFF, 0x00, 0x00, 0x00, 0xFF, 0x00, 0x00, 0x00, 0xFF, 0x00, 0x00, 0x00 };
ALIGN16( static byte SIMD_SSE2_byte_scale_mask2[16] ) = { 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00 };
ALIGN16( static byte SIMD_SSE2_byte_scale_mask3[16] ) = { 0xFF, 0xFF, 0x00, 0xFF, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0x00, 0xFF, 0x00, 0x00, 0x00, 0x00 };
ALIGN16( static byte SIMD_SSE2_byte_scale_mask4[16] ) = { 0x00, 0x00, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00 };
ALIGN16( static byte SIMD_SSE2_byte_minus_128_0[16] ) = { (byte)-128, (byte)-128, 0, 0, (byte)-128, (byte)-128, 0, 0, (byte)-128, (byte)-128, 0, 0, (byte)-128, (byte)-128, 0, 0 };

/*
========================
idDxtEncoder::ExtractBlock_SSE2
params: inPtr - input image, 4 bytes per pixel
paramO: colorBlock - 4*4 output tile, 4 bytes per pixel
========================
*/
ID_INLINE void idDxtEncoder::ExtractBlock_SSE2( const byte * inPtr, int width, byte * colorBlock ) const {
#if ( defined( ID_WIN_X86_ASM ) || defined( ID_MAC_X86_ASM ) )
    __asm {
        mov esi, inPtr
        mov edi, colorBlock
        mov eax, width
        shl eax, 2
        movdqa xmm0, xmmword ptr [esi]
        movdqa xmmword ptr [edi+ 0], xmm0
        movdqa xmm1, xmmword ptr [esi+eax] // + 4 * width
        movdqa xmmword ptr [edi+16], xmm1
        movdqa xmm2, xmmword ptr [esi+eax*2] // + 8 * width
        add esi, eax
        movdqa xmmword ptr [edi+32], xmm2
        movdqa xmm3, xmmword ptr [esi+eax*2] // + 12 * width
        movdqa xmmword ptr [edi+48], xmm3
    }
#elif defined ( ID_WIN_X86_SSE2_INTRIN )
    *((__m128i *)(&colorBlock[ 0])) = _mm_load_si128( (__m128i *)( inPtr + width * 4 * 0 ) );
    *((__m128i *)(&colorBlock[16])) = _mm_load_si128( (__m128i *)( inPtr + width * 4 * 1 ) );
    *((__m128i *)(&colorBlock[32])) = _mm_load_si128( (__m128i *)( inPtr + width * 4 * 2 ) );
    *((__m128i *)(&colorBlock[48])) = _mm_load_si128( (__m128i *)( inPtr + width * 4 * 3 ) );
#else
    assert( false );
#endif
}
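
// A minimal usage sketch (illustrative, assuming a 4-byte-per-pixel source whose
// width and height are multiples of 4, as the rest of this encoder requires):
//
//   ALIGN16( byte block[64] );
//   for ( int j = 0; j < height; j += 4, inBuf += width * 4 * 4 ) {
//       for ( int i = 0; i < width; i += 4 ) {
//           ExtractBlock_SSE2( inBuf + i * 4, width, block );
//           // ... find min/max, inset the box, emit color/alpha data ...
//       }
//   }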

/*
========================
idDxtEncoder::GetMinMaxBBox_SSE2
Finds the extents of the bounding box of the colors in the 4x4 block.
params: colorBlock - 4*4 input tile, 4 bytes per pixel
paramO: minColor - Min 4 byte output color
paramO: maxColor - Max 4 byte output color
========================
*/
ID_INLINE void idDxtEncoder::GetMinMaxBBox_SSE2( const byte * colorBlock, byte * minColor, byte * maxColor ) const {
#if ( defined( ID_WIN_X86_ASM ) || defined( ID_MAC_X86_ASM ) )
    __asm {
        mov eax, colorBlock
        mov esi, minColor
        mov edi, maxColor
        movdqa xmm0, xmmword ptr [eax+ 0]
        movdqa xmm1, xmmword ptr [eax+ 0]
        pminub xmm0, xmmword ptr [eax+16]
        pmaxub xmm1, xmmword ptr [eax+16]
        pminub xmm0, xmmword ptr [eax+32]
        pmaxub xmm1, xmmword ptr [eax+32]
        pminub xmm0, xmmword ptr [eax+48]
        pmaxub xmm1, xmmword ptr [eax+48]
        pshufd xmm3, xmm0, R_SHUFFLE_D( 2, 3, 2, 3 )
        pshufd xmm4, xmm1, R_SHUFFLE_D( 2, 3, 2, 3 )
        pminub xmm0, xmm3
        pmaxub xmm1, xmm4
        pshuflw xmm6, xmm0, R_SHUFFLE_D( 2, 3, 2, 3 )
        pshuflw xmm7, xmm1, R_SHUFFLE_D( 2, 3, 2, 3 )
        pminub xmm0, xmm6
        pmaxub xmm1, xmm7
        movd dword ptr [esi], xmm0
        movd dword ptr [edi], xmm1
    }
#elif defined ( ID_WIN_X86_SSE2_INTRIN )
    __m128i block0 = *((__m128i *)(&colorBlock[ 0]));
    __m128i block1 = *((__m128i *)(&colorBlock[16]));
    __m128i block2 = *((__m128i *)(&colorBlock[32]));
    __m128i block3 = *((__m128i *)(&colorBlock[48]));
    __m128i max1 = _mm_max_epu8( block0, block1 );
    __m128i min1 = _mm_min_epu8( block0, block1 );
    __m128i max2 = _mm_max_epu8( block2, block3 );
    __m128i min2 = _mm_min_epu8( block2, block3 );
    __m128i max3 = _mm_max_epu8( max1, max2 );
    __m128i min3 = _mm_min_epu8( min1, min2 );
    __m128i max4 = _mm_shuffle_epi32( max3, R_SHUFFLE_D( 2, 3, 2, 3 ) );
    __m128i min4 = _mm_shuffle_epi32( min3, R_SHUFFLE_D( 2, 3, 2, 3 ) );
    __m128i max5 = _mm_max_epu8( max3, max4 );
    __m128i min5 = _mm_min_epu8( min3, min4 );
    __m128i max6 = _mm_shufflelo_epi16( max5, R_SHUFFLE_D( 2, 3, 2, 3 ) );
    __m128i min6 = _mm_shufflelo_epi16( min5, R_SHUFFLE_D( 2, 3, 2, 3 ) );
    max6 = _mm_max_epu8( max5, max6 );
    min6 = _mm_min_epu8( min5, min6 );
    *((int *)maxColor) = _mm_cvtsi128_si32( max6 );
    *((int *)minColor) = _mm_cvtsi128_si32( min6 );
#else
    assert( false );
#endif
}
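
// The pminub/pmaxub chain above reduces the four rows vertically; the two shuffles
// then fold the result horizontally. The scalar equivalent, for reference (a sketch;
// Min/Max helpers as in idlib are assumed):
//
//   for ( int c = 0; c < 4; c++ ) {
//       byte lo = 255, hi = 0;
//       for ( int i = 0; i < 16; i++ ) {
//           lo = Min( lo, colorBlock[i * 4 + c] );
//           hi = Max( hi, colorBlock[i * 4 + c] );
//       }
//       minColor[c] = lo;
//       maxColor[c] = hi;
//   }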

/*
========================
idDxtEncoder::InsetColorsBBox_SSE2
========================
*/
ID_INLINE void idDxtEncoder::InsetColorsBBox_SSE2( byte * minColor, byte * maxColor ) const {
#if ( defined( ID_WIN_X86_ASM ) || defined( ID_MAC_X86_ASM ) )
    __asm {
        mov esi, minColor
        mov edi, maxColor
        movd xmm0, dword ptr [esi]
        movd xmm1, dword ptr [edi]
        punpcklbw xmm0, SIMD_SSE2_byte_0
        punpcklbw xmm1, SIMD_SSE2_byte_0
        movdqa xmm2, xmm1
        psubw xmm2, xmm0
        pmulhw xmm2, SIMD_SSE2_word_insetShift
        paddw xmm0, xmm2
        psubw xmm1, xmm2
        packuswb xmm0, xmm0
        packuswb xmm1, xmm1
        movd dword ptr [esi], xmm0
        movd dword ptr [edi], xmm1
    }
#elif defined ( ID_WIN_X86_SSE2_INTRIN )
    __m128i min = _mm_cvtsi32_si128( *(int *)minColor );
    __m128i max = _mm_cvtsi32_si128( *(int *)maxColor );
    __m128i xmm0 = _mm_unpacklo_epi8( min, *(__m128i *)SIMD_SSE2_byte_0 );
    __m128i xmm1 = _mm_unpacklo_epi8( max, *(__m128i *)SIMD_SSE2_byte_0 );
    __m128i xmm2 = _mm_sub_epi16( xmm1, xmm0 );
    xmm2 = _mm_mulhi_epi16( xmm2, *(__m128i *)SIMD_SSE2_word_insetShift );
    xmm0 = _mm_add_epi16( xmm0, xmm2 );
    xmm1 = _mm_sub_epi16( xmm1, xmm2 );
    xmm0 = _mm_packus_epi16( xmm0, xmm0 );
    xmm1 = _mm_packus_epi16( xmm1, xmm1 );
    *((int *)minColor) = _mm_cvtsi128_si32( xmm0 );
    *((int *)maxColor) = _mm_cvtsi128_si32( xmm1 );
#else
    assert( false );
#endif
}
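
// Multiplying the range by SIMD_SSE2_word_insetShift with pmulhw keeps the high 16
// bits of ( max - min ) * ( 1 << ( 16 - shift ) ), which is ( max - min ) >> shift.
// In scalar form, per channel:
//
//   inset = ( max - min ) >> INSET_COLOR_SHIFT; // INSET_ALPHA_SHIFT for alpha
//   min += inset;
//   max -= inset;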

/*
========================
idDxtEncoder::EmitColorIndices_SSE2
params: colorBlock - 16 pixel block for which to find color indices
paramO: minColor - Min color found
paramO: maxColor - Max color found
return: 4 byte color index block
========================
*/
void idDxtEncoder::EmitColorIndices_SSE2( const byte * colorBlock, const byte * minColor_, const byte * maxColor_ ) {
#if ( defined( ID_WIN_X86_ASM ) || defined( ID_MAC_X86_ASM ) )
    ALIGN16( byte color0[16] );
    ALIGN16( byte color1[16] );
    ALIGN16( byte color2[16] );
    ALIGN16( byte color3[16] );
    ALIGN16( byte result[16] );
    byte *outPtr = outData;
    __asm {
        mov esi, maxColor_
        mov edi, minColor_
        pxor xmm7, xmm7
        movdqa result, xmm7
        movd xmm0, dword ptr [esi]
        pand xmm0, SIMD_SSE2_byte_colorMask
        punpcklbw xmm0, xmm7
        pshuflw xmm4, xmm0, R_SHUFFLE_D( 0, 3, 2, 3 )
        pshuflw xmm5, xmm0, R_SHUFFLE_D( 3, 1, 3, 3 )
        psrlw xmm4, 5
        psrlw xmm5, 6
        por xmm0, xmm4
        por xmm0, xmm5
        movd xmm1, dword ptr [edi]
        pand xmm1, SIMD_SSE2_byte_colorMask
        punpcklbw xmm1, xmm7
        pshuflw xmm4, xmm1, R_SHUFFLE_D( 0, 3, 2, 3 )
        pshuflw xmm5, xmm1, R_SHUFFLE_D( 3, 1, 3, 3 )
        psrlw xmm4, 5
        psrlw xmm5, 6
        por xmm1, xmm4
        por xmm1, xmm5
        movdqa xmm2, xmm0
        packuswb xmm2, xmm7
        pshufd xmm2, xmm2, R_SHUFFLE_D( 0, 1, 0, 1 )
        movdqa color0, xmm2
        movdqa xmm6, xmm0
        paddw xmm6, xmm0
        paddw xmm6, xmm1
        pmulhw xmm6, SIMD_SSE2_word_div_by_3 // * ( ( 1 << 16 ) / 3 + 1 ) ) >> 16
        packuswb xmm6, xmm7
        pshufd xmm6, xmm6, R_SHUFFLE_D( 0, 1, 0, 1 )
        movdqa color2, xmm6
        movdqa xmm3, xmm1
        packuswb xmm3, xmm7
        pshufd xmm3, xmm3, R_SHUFFLE_D( 0, 1, 0, 1 )
        movdqa color1, xmm3
        paddw xmm1, xmm1
        paddw xmm0, xmm1
        pmulhw xmm0, SIMD_SSE2_word_div_by_3 // * ( ( 1 << 16 ) / 3 + 1 ) ) >> 16
        packuswb xmm0, xmm7
        pshufd xmm0, xmm0, R_SHUFFLE_D( 0, 1, 0, 1 )
        movdqa color3, xmm0
        mov eax, 32
        mov esi, colorBlock
    loop1: // iterates 2 times
        movq xmm3, qword ptr [esi+eax+0]
        pshufd xmm3, xmm3, R_SHUFFLE_D( 0, 2, 1, 3 ) // punpckldq xmm4, SIMD_SSE2_dword_0
        movq xmm5, qword ptr [esi+eax+8]
        pshufd xmm5, xmm5, R_SHUFFLE_D( 0, 2, 1, 3 ) // punpckldq xmm5, SIMD_SSE2_dword_0
        movdqa xmm0, xmm3
        movdqa xmm6, xmm5
        psadbw xmm0, color0
        psadbw xmm6, color0
        packssdw xmm0, xmm6
        movdqa xmm1, xmm3
        movdqa xmm6, xmm5
        psadbw xmm1, color1
        psadbw xmm6, color1
        packssdw xmm1, xmm6
        movdqa xmm2, xmm3
        movdqa xmm6, xmm5
        psadbw xmm2, color2
        psadbw xmm6, color2
        packssdw xmm2, xmm6
        psadbw xmm3, color3
        psadbw xmm5, color3
        packssdw xmm3, xmm5
        movq xmm4, qword ptr [esi+eax+16]
        pshufd xmm4, xmm4, R_SHUFFLE_D( 0, 2, 1, 3 )
        movq xmm5, qword ptr [esi+eax+24]
        pshufd xmm5, xmm5, R_SHUFFLE_D( 0, 2, 1, 3 )
        movdqa xmm6, xmm4
        movdqa xmm7, xmm5
        psadbw xmm6, color0
        psadbw xmm7, color0
        packssdw xmm6, xmm7
        packssdw xmm0, xmm6 // d0
        movdqa xmm6, xmm4
        movdqa xmm7, xmm5
        psadbw xmm6, color1
        psadbw xmm7, color1
        packssdw xmm6, xmm7
        packssdw xmm1, xmm6 // d1
        movdqa xmm6, xmm4
        movdqa xmm7, xmm5
        psadbw xmm6, color2
        psadbw xmm7, color2
        packssdw xmm6, xmm7
        packssdw xmm2, xmm6 // d2
        psadbw xmm4, color3
        psadbw xmm5, color3
        packssdw xmm4, xmm5
        packssdw xmm3, xmm4 // d3
        movdqa xmm7, result
        pslld xmm7, 16
        movdqa xmm4, xmm0
        movdqa xmm5, xmm1
        pcmpgtw xmm0, xmm3 // b0
        pcmpgtw xmm1, xmm2 // b1
        pcmpgtw xmm4, xmm2 // b2
        pcmpgtw xmm5, xmm3 // b3
        pcmpgtw xmm2, xmm3 // b4
        pand xmm4, xmm1 // x0
        pand xmm5, xmm0 // x1
        pand xmm2, xmm0 // x2
        por xmm4, xmm5
        pand xmm2, SIMD_SSE2_word_1
        pand xmm4, SIMD_SSE2_word_2
        por xmm2, xmm4
        pshufd xmm5, xmm2, R_SHUFFLE_D( 2, 3, 0, 1 )
        punpcklwd xmm2, SIMD_SSE2_word_0
        punpcklwd xmm5, SIMD_SSE2_word_0
        pslld xmm5, 8
        por xmm7, xmm5
        por xmm7, xmm2
        movdqa result, xmm7
        sub eax, 32
        jge loop1
        mov esi, outPtr
        pshufd xmm4, xmm7, R_SHUFFLE_D( 1, 2, 3, 0 )
        pshufd xmm5, xmm7, R_SHUFFLE_D( 2, 3, 0, 1 )
        pshufd xmm6, xmm7, R_SHUFFLE_D( 3, 0, 1, 2 )
        pslld xmm4, 2
        pslld xmm5, 4
        pslld xmm6, 6
        por xmm7, xmm4
        por xmm7, xmm5
        por xmm7, xmm6
        movd dword ptr [esi], xmm7
    }
    outData += 4;
#elif defined ( ID_WIN_X86_SSE2_INTRIN )
    __m128c zero = SIMD_SSE2_zero;
    __m128c result = SIMD_SSE2_zero;
    __m128c color0, color1, color2, color3;
    __m128c temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
    __m128c minColor = _mm_cvtsi32_si128( *(int *)minColor_ );
    __m128c maxColor = _mm_cvtsi32_si128( *(int *)maxColor_ );
    __m128c blocka[2], blockb[2];
    blocka[0] = *((__m128i *)(&colorBlock[ 0]));
    blocka[1] = *((__m128i *)(&colorBlock[32]));
    blockb[0] = *((__m128i *)(&colorBlock[16]));
    blockb[1] = *((__m128i *)(&colorBlock[48]));
    temp0 = _mm_and_si128( maxColor, (const __m128i &)SIMD_SSE2_byte_colorMask );
    temp0 = _mm_unpacklo_epi8( temp0, zero );
    temp4 = _mm_shufflelo_epi16( temp0, R_SHUFFLE_D( 0, 3, 2, 3 ) );
    temp5 = _mm_shufflelo_epi16( temp0, R_SHUFFLE_D( 3, 1, 3, 3 ) );
    temp4 = _mm_srli_epi16( temp4, 5 );
    temp5 = _mm_srli_epi16( temp5, 6 );
    temp0 = _mm_or_si128( temp0, temp4 );
    temp0 = _mm_or_si128( temp0, temp5 );
    temp1 = _mm_and_si128( minColor, (const __m128i &)SIMD_SSE2_byte_colorMask );
    temp1 = _mm_unpacklo_epi8( temp1, zero );
    temp4 = _mm_shufflelo_epi16( temp1, R_SHUFFLE_D( 0, 3, 2, 3 ) );
    temp5 = _mm_shufflelo_epi16( temp1, R_SHUFFLE_D( 3, 1, 3, 3 ) );
    temp4 = _mm_srli_epi16( temp4, 5 );
    temp5 = _mm_srli_epi16( temp5, 6 );
    temp1 = _mm_or_si128( temp1, temp4 );
    temp1 = _mm_or_si128( temp1, temp5 );
    temp2 = _mm_packus_epi16( temp0, zero );
    color0 = _mm_shuffle_epi32( temp2, R_SHUFFLE_D( 0, 1, 0, 1 ) );
    temp6 = _mm_add_epi16( temp0, temp0 );
    temp6 = _mm_add_epi16( temp6, temp1 );
    temp6 = _mm_mulhi_epi16( temp6, (const __m128i &)SIMD_SSE2_word_div_by_3 ); // * ( ( 1 << 16 ) / 3 + 1 ) ) >> 16
    temp6 = _mm_packus_epi16( temp6, zero );
    color2 = _mm_shuffle_epi32( temp6, R_SHUFFLE_D( 0, 1, 0, 1 ) );
    temp3 = _mm_packus_epi16( temp1, zero );
    color1 = _mm_shuffle_epi32( temp3, R_SHUFFLE_D( 0, 1, 0, 1 ) );
    temp1 = _mm_add_epi16( temp1, temp1 );
    temp0 = _mm_add_epi16( temp0, temp1 );
    temp0 = _mm_mulhi_epi16( temp0, (const __m128i &)SIMD_SSE2_word_div_by_3 ); // * ( ( 1 << 16 ) / 3 + 1 ) ) >> 16
    temp0 = _mm_packus_epi16( temp0, zero );
    color3 = _mm_shuffle_epi32( temp0, R_SHUFFLE_D( 0, 1, 0, 1 ) );
    for ( int i = 1; i >= 0; i-- ) {
        // Load block
        temp3 = _mm_shuffle_epi32( blocka[i], R_SHUFFLE_D( 0, 2, 1, 3 ) );
        temp5 = _mm_shuffle_ps( blocka[i], zero, R_SHUFFLE_D( 2, 3, 0, 1 ) );
        temp5 = _mm_shuffle_epi32( temp5, R_SHUFFLE_D( 0, 2, 1, 3 ) );
        temp0 = _mm_sad_epu8( temp3, color0 );
        temp6 = _mm_sad_epu8( temp5, color0 );
        temp0 = _mm_packs_epi32( temp0, temp6 );
        temp1 = _mm_sad_epu8( temp3, color1 );
        temp6 = _mm_sad_epu8( temp5, color1 );
        temp1 = _mm_packs_epi32( temp1, temp6 );
        temp2 = _mm_sad_epu8( temp3, color2 );
        temp6 = _mm_sad_epu8( temp5, color2 );
        temp2 = _mm_packs_epi32( temp2, temp6 );
        temp3 = _mm_sad_epu8( temp3, color3 );
        temp5 = _mm_sad_epu8( temp5, color3 );
        temp3 = _mm_packs_epi32( temp3, temp5 );
        // Load block
        temp4 = _mm_shuffle_epi32( blockb[i], R_SHUFFLE_D( 0, 2, 1, 3 ) );
        temp5 = _mm_shuffle_ps( blockb[i], zero, R_SHUFFLE_D( 2, 3, 0, 1 ) );
        temp5 = _mm_shuffle_epi32( temp5, R_SHUFFLE_D( 0, 2, 1, 3 ) );
        temp6 = _mm_sad_epu8( temp4, color0 );
        temp7 = _mm_sad_epu8( temp5, color0 );
        temp6 = _mm_packs_epi32( temp6, temp7 );
        temp0 = _mm_packs_epi32( temp0, temp6 ); // d0
        temp6 = _mm_sad_epu8( temp4, color1 );
        temp7 = _mm_sad_epu8( temp5, color1 );
        temp6 = _mm_packs_epi32( temp6, temp7 );
        temp1 = _mm_packs_epi32( temp1, temp6 ); // d1
        temp6 = _mm_sad_epu8( temp4, color2 );
        temp7 = _mm_sad_epu8( temp5, color2 );
        temp6 = _mm_packs_epi32( temp6, temp7 );
        temp2 = _mm_packs_epi32( temp2, temp6 ); // d2
        temp4 = _mm_sad_epu8( temp4, color3 );
        temp5 = _mm_sad_epu8( temp5, color3 );
        temp4 = _mm_packs_epi32( temp4, temp5 );
        temp3 = _mm_packs_epi32( temp3, temp4 ); // d3
        temp7 = _mm_slli_epi32( result, 16 );
        temp4 = _mm_cmpgt_epi16( temp0, temp2 ); // b2
        temp5 = _mm_cmpgt_epi16( temp1, temp3 ); // b3
        temp0 = _mm_cmpgt_epi16( temp0, temp3 ); // b0
        temp1 = _mm_cmpgt_epi16( temp1, temp2 ); // b1
        temp2 = _mm_cmpgt_epi16( temp2, temp3 ); // b4
        temp4 = _mm_and_si128( temp4, temp1 ); // x0
        temp5 = _mm_and_si128( temp5, temp0 ); // x1
        temp2 = _mm_and_si128( temp2, temp0 ); // x2
        temp4 = _mm_or_si128( temp4, temp5 );
        temp2 = _mm_and_si128( temp2, (const __m128i &)SIMD_SSE2_word_1 );
        temp4 = _mm_and_si128( temp4, (const __m128i &)SIMD_SSE2_word_2 );
        temp2 = _mm_or_si128( temp2, temp4 );
        temp5 = _mm_shuffle_epi32( temp2, R_SHUFFLE_D( 2, 3, 0, 1 ) );
        temp2 = _mm_unpacklo_epi16( temp2, (const __m128i &)SIMD_SSE2_word_0 );
        temp5 = _mm_unpacklo_epi16( temp5, (const __m128i &)SIMD_SSE2_word_0 );
        temp5 = _mm_slli_epi32( temp5, 8 );
        temp7 = _mm_or_si128( temp7, temp5 );
        result = _mm_or_si128( temp7, temp2 );
    }
    temp4 = _mm_shuffle_epi32( result, R_SHUFFLE_D( 1, 2, 3, 0 ) );
    temp5 = _mm_shuffle_epi32( result, R_SHUFFLE_D( 2, 3, 0, 1 ) );
    temp6 = _mm_shuffle_epi32( result, R_SHUFFLE_D( 3, 0, 1, 2 ) );
    temp4 = _mm_slli_epi32( temp4, 2 );
    temp5 = _mm_slli_epi32( temp5, 4 );
    temp6 = _mm_slli_epi32( temp6, 6 );
    temp7 = _mm_or_si128( result, temp4 );
    temp7 = _mm_or_si128( temp7, temp5 );
    temp7 = _mm_or_si128( temp7, temp6 );
    unsigned int out = _mm_cvtsi128_si32( temp7 );
    EmitUInt( out );
#else
    assert( false );
#endif
}
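
// The b0..b4 / x0..x2 comparisons above are the branchless form of picking the
// palette entry with the smallest sum of absolute differences. A scalar rendering of
// the same selection (a sketch; d0..d3 are the SADs of one pixel against
// color0..color3, mirroring the comments in the vector code):
//
//   int b0 = ( d0 > d3 );
//   int b1 = ( d1 > d2 );
//   int b2 = ( d0 > d2 );
//   int b3 = ( d1 > d3 );
//   int b4 = ( d2 > d3 );
//   int x0 = b1 & b2;
//   int x1 = b0 & b3;
//   int x2 = b0 & b4;
//   result |= ( x2 | ( ( x0 | x1 ) << 1 ) ) << ( i << 1 );
//
// Because color2 and color3 lie on the segment between color0 and color1, these five
// comparisons are sufficient to select the closest entry.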

/*
========================
idDxtEncoder::EmitColorAlphaIndices_SSE2
params: colorBlock - 16 pixel block for which to find color indices
paramO: minColor - Min color found
paramO: maxColor - Max color found
return: 4 byte color index block
========================
*/
void idDxtEncoder::EmitColorAlphaIndices_SSE2( const byte *colorBlock, const byte *minColor_, const byte *maxColor_ ) {
#if ( defined( ID_WIN_X86_ASM ) || defined( ID_MAC_X86_ASM ) )
    ALIGN16( byte color0[16] );
    ALIGN16( byte color1[16] );
    ALIGN16( byte color2[16] );
    ALIGN16( byte color3[16] );
    ALIGN16( byte result[16] );
    byte *outPtr = outData;
    __asm {
        mov esi, maxColor_
        mov edi, minColor_
        pxor xmm7, xmm7
        movdqa result, xmm7
        movd xmm0, dword ptr [esi]
        pand xmm0, SIMD_SSE2_byte_colorMask
        punpcklbw xmm0, xmm7
        pshuflw xmm4, xmm0, R_SHUFFLE_D( 0, 3, 2, 3 )
        pshuflw xmm5, xmm0, R_SHUFFLE_D( 3, 1, 3, 3 )
        psrlw xmm4, 5
        psrlw xmm5, 6
        por xmm0, xmm4
        por xmm0, xmm5
        movd xmm1, dword ptr [edi]
        pand xmm1, SIMD_SSE2_byte_colorMask
        punpcklbw xmm1, xmm7
        pshuflw xmm4, xmm1, R_SHUFFLE_D( 0, 3, 2, 3 )
        pshuflw xmm5, xmm1, R_SHUFFLE_D( 3, 1, 3, 3 )
        psrlw xmm4, 5
        psrlw xmm5, 6
        por xmm1, xmm4
        por xmm1, xmm5
        movdqa xmm2, xmm0
        packuswb xmm2, xmm7
        pshufd xmm2, xmm2, R_SHUFFLE_D( 0, 1, 0, 1 )
        movdqa color0, xmm2
        movdqa xmm6, xmm0
        paddw xmm6, xmm1
        psrlw xmm6, 1
        packuswb xmm6, xmm7
        pshufd xmm6, xmm6, R_SHUFFLE_D( 0, 1, 0, 1 )
        movdqa color2, xmm6
        movdqa xmm3, xmm1
        packuswb xmm3, xmm7
        pshufd xmm3, xmm3, R_SHUFFLE_D( 0, 1, 0, 1 )
        movdqa color1, xmm3
        movdqa color3, xmm7
        mov eax, 32
        mov esi, colorBlock
    loop1: // iterates 2 times
        movq xmm3, qword ptr [esi+eax+0]
        pshufd xmm3, xmm3, R_SHUFFLE_D( 0, 2, 1, 3 )
        movq xmm5, qword ptr [esi+eax+8]
        pshufd xmm5, xmm5, R_SHUFFLE_D( 0, 2, 1, 3 )
        movdqa xmm0, xmm3
        movdqa xmm6, xmm5
        psadbw xmm0, color0
        psadbw xmm6, color0
        packssdw xmm0, xmm6
        movdqa xmm1, xmm3
        movdqa xmm6, xmm5
        psadbw xmm1, color1
        psadbw xmm6, color1
        packssdw xmm1, xmm6
        movdqa xmm2, xmm3
        movdqa xmm6, xmm5
        psadbw xmm2, color2
        psadbw xmm6, color2
        packssdw xmm2, xmm6
        shufps xmm3, xmm5, R_SHUFFLE_D( 0, 2, 0, 2 )
        psrld xmm3, 24
        packssdw xmm3, xmm3
        movq xmm4, qword ptr [esi+eax+16]
        pshufd xmm4, xmm4, R_SHUFFLE_D( 0, 2, 1, 3 )
        movq xmm5, qword ptr [esi+eax+24]
        pshufd xmm5, xmm5, R_SHUFFLE_D( 0, 2, 1, 3 )
        movdqa xmm6, xmm4
        movdqa xmm7, xmm5
        psadbw xmm6, color0
        psadbw xmm7, color0
        packssdw xmm6, xmm7
        packssdw xmm0, xmm6 // d0
        movdqa xmm6, xmm4
        movdqa xmm7, xmm5
        psadbw xmm6, color1
        psadbw xmm7, color1
        packssdw xmm6, xmm7
        packssdw xmm1, xmm6 // d1
        movdqa xmm6, xmm4
        movdqa xmm7, xmm5
        psadbw xmm6, color2
        psadbw xmm7, color2
        packssdw xmm6, xmm7
        packssdw xmm2, xmm6 // d2
        shufps xmm4, xmm5, R_SHUFFLE_D( 0, 2, 0, 2 )
        psrld xmm4, 24
        packssdw xmm4, xmm4
        punpcklqdq xmm3, xmm4 // c3
        movdqa xmm7, result
        pslld xmm7, 16
        movdqa xmm4, xmm2
        pcmpgtw xmm2, xmm0 // b0
        pcmpgtw xmm4, xmm1 // b1
        pcmpgtw xmm1, xmm0 // b2
        pmaxsw xmm3, SIMD_SSE2_word_127 // b3
        pcmpeqw xmm3, SIMD_SSE2_word_127
        pand xmm2, xmm4
        por xmm2, xmm3 // b0 & b1 | b3
        pxor xmm1, xmm4
        por xmm1, xmm3 // b2 ^ b1 | b3
        pand xmm2, SIMD_SSE2_word_2
        pand xmm1, SIMD_SSE2_word_1
        por xmm2, xmm1
        pshufd xmm5, xmm2, R_SHUFFLE_D( 2, 3, 0, 1 )
        punpcklwd xmm2, SIMD_SSE2_word_0
        punpcklwd xmm5, SIMD_SSE2_word_0
        pslld xmm5, 8
        por xmm7, xmm5
        por xmm7, xmm2
        movdqa result, xmm7
        sub eax, 32
        jge loop1
        mov esi, outPtr
        pshufd xmm4, xmm7, R_SHUFFLE_D( 1, 2, 3, 0 )
        pshufd xmm5, xmm7, R_SHUFFLE_D( 2, 3, 0, 1 )
        pshufd xmm6, xmm7, R_SHUFFLE_D( 3, 0, 1, 2 )
        pslld xmm4, 2
        pslld xmm5, 4
        pslld xmm6, 6
        por xmm7, xmm4
        por xmm7, xmm5
        por xmm7, xmm6
        movd dword ptr [esi], xmm7
    }
    outData += 4;
#elif defined ( ID_WIN_X86_SSE2_INTRIN )
    __m128c zero = SIMD_SSE2_zero;
    __m128c result = SIMD_SSE2_zero;
    __m128c color0, color1, color2;
    __m128c temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
    __m128c minColor = _mm_cvtsi32_si128( *(int *)minColor_ );
    __m128c maxColor = _mm_cvtsi32_si128( *(int *)maxColor_ );
    __m128c blocka[2], blockb[2];
    blocka[0] = *((__m128i *)(&colorBlock[ 0]));
    blocka[1] = *((__m128i *)(&colorBlock[32]));
    blockb[0] = *((__m128i *)(&colorBlock[16]));
    blockb[1] = *((__m128i *)(&colorBlock[48]));
    temp0 = _mm_and_si128( maxColor, *(__m128c*)SIMD_SSE2_byte_colorMask );
    temp0 = _mm_unpacklo_epi8( temp0, zero );
    temp4 = _mm_shufflelo_epi16( temp0, R_SHUFFLE_D( 0, 3, 2, 3 ) );
    temp5 = _mm_shufflelo_epi16( temp0, R_SHUFFLE_D( 3, 1, 3, 3 ) );
    temp4 = _mm_srli_epi16( temp4, 5 );
    temp5 = _mm_srli_epi16( temp5, 6 );
    temp0 = _mm_or_si128( temp0, temp4 );
    temp0 = _mm_or_si128( temp0, temp5 );
    temp1 = _mm_and_si128( minColor, *(__m128c*)SIMD_SSE2_byte_colorMask );
    temp1 = _mm_unpacklo_epi8( temp1, zero );
    temp4 = _mm_shufflelo_epi16( temp1, R_SHUFFLE_D( 0, 3, 2, 3 ) );
    temp5 = _mm_shufflelo_epi16( temp1, R_SHUFFLE_D( 3, 1, 3, 3 ) );
    temp4 = _mm_srli_epi16( temp4, 5 );
    temp5 = _mm_srli_epi16( temp5, 6 );
    temp1 = _mm_or_si128( temp1, temp4 );
    temp1 = _mm_or_si128( temp1, temp5 );
    temp2 = _mm_packus_epi16( temp0, zero );
    color0 = _mm_shuffle_epi32( temp2, R_SHUFFLE_D( 0, 1, 0, 1 ) );
    temp6 = _mm_add_epi16( temp0, temp1 ); // max + min, halved below to get the midpoint color2, matching the asm path
    temp6 = _mm_srli_epi16( temp6, 1 ); // diff from color
    temp6 = _mm_packus_epi16( temp6, zero );
    color2 = _mm_shuffle_epi32( temp6, R_SHUFFLE_D( 0, 1, 0, 1 ) );
    temp3 = _mm_packus_epi16( temp1, zero );
    color1 = _mm_shuffle_epi32( temp3, R_SHUFFLE_D( 0, 1, 0, 1 ) );
    // not used
    //color3 = zero;
    for ( int i = 1; i >= 0; i-- ) {
        // Load block
        temp3 = _mm_shuffle_epi32( blocka[i], R_SHUFFLE_D( 0, 2, 1, 3 ) );
        temp5 = _mm_shuffle_ps( blocka[i], zero, R_SHUFFLE_D( 2, 3, 0, 1 ) );
        temp5 = _mm_shuffle_epi32( temp5, R_SHUFFLE_D( 0, 2, 1, 3 ) );
        temp0 = _mm_sad_epu8( temp3, color0 );
        temp6 = _mm_sad_epu8( temp5, color0 );
        temp0 = _mm_packs_epi32( temp0, temp6 );
        temp1 = _mm_sad_epu8( temp3, color1 );
        temp6 = _mm_sad_epu8( temp5, color1 );
        temp1 = _mm_packs_epi32( temp1, temp6 );
        temp2 = _mm_sad_epu8( temp3, color2 );
        temp6 = _mm_sad_epu8( temp5, color2 );
        temp2 = _mm_packs_epi32( temp2, temp6 );
        // diff from color
        temp3 = _mm_shuffle_ps( temp3, temp5, R_SHUFFLE_D( 0, 2, 0, 2 ) );
        temp3 = _mm_srli_epi32( temp3, 24 );
        temp3 = _mm_packs_epi32( temp3, temp3 );
        // Load block
        temp4 = _mm_shuffle_epi32( blockb[i], R_SHUFFLE_D( 0, 2, 1, 3 ) );
        temp5 = _mm_shuffle_ps( blockb[i], zero, R_SHUFFLE_D( 2, 3, 0, 1 ) );
        temp5 = _mm_shuffle_epi32( temp5, R_SHUFFLE_D( 0, 2, 1, 3 ) );
        temp6 = _mm_sad_epu8( temp4, color0 );
        temp7 = _mm_sad_epu8( temp5, color0 );
        temp6 = _mm_packs_epi32( temp6, temp7 );
        temp0 = _mm_packs_epi32( temp0, temp6 ); // d0
        temp6 = _mm_sad_epu8( temp4, color1 );
        temp7 = _mm_sad_epu8( temp5, color1 );
        temp6 = _mm_packs_epi32( temp6, temp7 );
        temp1 = _mm_packs_epi32( temp1, temp6 ); // d1
        temp6 = _mm_sad_epu8( temp4, color2 );
        temp7 = _mm_sad_epu8( temp5, color2 );
        temp6 = _mm_packs_epi32( temp6, temp7 );
        temp2 = _mm_packs_epi32( temp2, temp6 ); // d2
        // diff from color
        temp4 = _mm_shuffle_ps( temp4, temp5, R_SHUFFLE_D( 0, 2, 0, 2 ) ); // c3
        temp4 = _mm_srli_epi32( temp4, 24 );
        temp4 = _mm_packs_epi32( temp4, temp4 );
        temp3 = _mm_unpacklo_epi64( temp3, temp4 );
        temp7 = _mm_slli_epi32( result, 16 );
        // diff from color
        temp4 = _mm_cmpgt_epi16( temp2, temp1 ); // b1
        temp2 = _mm_cmpgt_epi16( temp2, temp0 ); // b0
        temp1 = _mm_cmpgt_epi16( temp1, temp0 ); // b2
        temp3 = _mm_max_epi16( temp3, (const __m128i &)SIMD_SSE2_word_127 ); // b3
        temp3 = _mm_cmpeq_epi16( temp3, (const __m128i &)SIMD_SSE2_word_127 );
        temp2 = _mm_and_si128( temp2, temp4 );
        temp2 = _mm_or_si128( temp2, temp3 ); // b0 & b1 | b3
        temp1 = _mm_xor_si128( temp1, temp4 );
        temp1 = _mm_or_si128( temp1, temp3 ); // b2 ^ b1 | b3
        temp2 = _mm_and_si128( temp2, (const __m128i &)SIMD_SSE2_word_2 );
        temp1 = _mm_and_si128( temp1, (const __m128i &)SIMD_SSE2_word_1 );
        temp2 = _mm_or_si128( temp2, temp1 );
        temp5 = _mm_shuffle_epi32( temp2, R_SHUFFLE_D( 2, 3, 0, 1 ) );
        temp2 = _mm_unpacklo_epi16( temp2, (const __m128i &)SIMD_SSE2_word_0 );
        temp5 = _mm_unpacklo_epi16( temp5, (const __m128i &)SIMD_SSE2_word_0 );
        temp5 = _mm_slli_epi32( temp5, 8 );
        temp7 = _mm_or_si128( temp7, temp5 );
        result = _mm_or_si128( temp7, temp2 );
    }
    temp4 = _mm_shuffle_epi32( result, R_SHUFFLE_D( 1, 2, 3, 0 ) );
    temp5 = _mm_shuffle_epi32( result, R_SHUFFLE_D( 2, 3, 0, 1 ) );
    temp6 = _mm_shuffle_epi32( result, R_SHUFFLE_D( 3, 0, 1, 2 ) );
    temp4 = _mm_slli_epi32( temp4, 2 );
    temp5 = _mm_slli_epi32( temp5, 4 );
    temp6 = _mm_slli_epi32( temp6, 6 );
    temp7 = _mm_or_si128( result, temp4 );
    temp7 = _mm_or_si128( temp7, temp5 );
    temp7 = _mm_or_si128( temp7, temp6 );
    unsigned int out = _mm_cvtsi128_si32( temp7 );
    EmitUInt( out );
#else
    assert( false );
#endif
}
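
// Unlike EmitColorIndices_SSE2 above, this path targets the DXT1 three-color mode:
// color2 is the midpoint ( max + min ) / 2, index 3 means transparent black, and the
// pmaxsw/pcmpeqw pair against SIMD_SSE2_word_127 forces index 3 for any pixel whose
// alpha is 127 or less ( max( alpha, 127 ) == 127 iff alpha <= 127 ).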

/*
========================
idDxtEncoder::EmitCoCgIndices_SSE2
params: colorBlock - 16 pixel block for which to find color indices
paramO: minColor - Min color found
paramO: maxColor - Max color found
return: 4 byte color index block
========================
*/
void idDxtEncoder::EmitCoCgIndices_SSE2( const byte *colorBlock, const byte *minColor_, const byte *maxColor_ ) {
#if ( defined( ID_WIN_X86_ASM ) || defined( ID_MAC_X86_ASM ) )
    ALIGN16( byte color0[16] );
    ALIGN16( byte color1[16] );
    ALIGN16( byte color2[16] );
    ALIGN16( byte color3[16] );
    ALIGN16( byte result[16] );
    byte *outPtr = outData;
    __asm {
        mov esi, maxColor_
        mov edi, minColor_
        pxor xmm7, xmm7
        movdqa result, xmm7
        movd xmm0, dword ptr [esi]
        pand xmm0, SIMD_SSE2_byte_colorMask2
        pshufd xmm0, xmm0, R_SHUFFLE_D( 0, 1, 0, 1 )
        movdqa color0, xmm0
        movd xmm1, dword ptr [edi]
        pand xmm1, SIMD_SSE2_byte_colorMask2
        pshufd xmm1, xmm1, R_SHUFFLE_D( 0, 1, 0, 1 )
        movdqa color1, xmm1
        punpcklbw xmm0, xmm7
        punpcklbw xmm1, xmm7
        movdqa xmm6, xmm1
        paddw xmm1, xmm0
        paddw xmm0, xmm1
        pmulhw xmm0, SIMD_SSE2_word_div_by_3 // * ( ( 1 << 16 ) / 3 + 1 ) ) >> 16
        packuswb xmm0, xmm7
        pshufd xmm0, xmm0, R_SHUFFLE_D( 0, 1, 0, 1 )
        movdqa color2, xmm0
        paddw xmm1, xmm6
        pmulhw xmm1, SIMD_SSE2_word_div_by_3 // * ( ( 1 << 16 ) / 3 + 1 ) ) >> 16
        packuswb xmm1, xmm7
        pshufd xmm1, xmm1, R_SHUFFLE_D( 0, 1, 0, 1 )
        movdqa color3, xmm1
        mov eax, 32
        mov esi, colorBlock
    loop1: // iterates 2 times
        movq xmm3, qword ptr [esi+eax+0]
        pshufd xmm3, xmm3, R_SHUFFLE_D( 0, 2, 1, 3 ) // punpckldq xmm4, SIMD_SSE2_dword_0
        movq xmm5, qword ptr [esi+eax+8]
        pshufd xmm5, xmm5, R_SHUFFLE_D( 0, 2, 1, 3 ) // punpckldq xmm5, SIMD_SSE2_dword_0
        movdqa xmm0, xmm3
        movdqa xmm6, xmm5
        psadbw xmm0, color0
        psadbw xmm6, color0
        packssdw xmm0, xmm6
        movdqa xmm1, xmm3
        movdqa xmm6, xmm5
        psadbw xmm1, color1
        psadbw xmm6, color1
        packssdw xmm1, xmm6
        movdqa xmm2, xmm3
        movdqa xmm6, xmm5
        psadbw xmm2, color2
        psadbw xmm6, color2
        packssdw xmm2, xmm6
        psadbw xmm3, color3
        psadbw xmm5, color3
        packssdw xmm3, xmm5
        movq xmm4, qword ptr [esi+eax+16]
        pshufd xmm4, xmm4, R_SHUFFLE_D( 0, 2, 1, 3 )
        movq xmm5, qword ptr [esi+eax+24]
        pshufd xmm5, xmm5, R_SHUFFLE_D( 0, 2, 1, 3 )
        movdqa xmm6, xmm4
        movdqa xmm7, xmm5
        psadbw xmm6, color0
        psadbw xmm7, color0
        packssdw xmm6, xmm7
        packssdw xmm0, xmm6 // d0
  852. movdqa xmm6, xmm4
  853. movdqa xmm7, xmm5
  854. psadbw xmm6, color1
  855. psadbw xmm7, color1
  856. packssdw xmm6, xmm7
  857. packssdw xmm1, xmm6 // d1
  858. movdqa xmm6, xmm4
  859. movdqa xmm7, xmm5
  860. psadbw xmm6, color2
  861. psadbw xmm7, color2
  862. packssdw xmm6, xmm7
  863. packssdw xmm2, xmm6 // d2
  864. psadbw xmm4, color3
  865. psadbw xmm5, color3
  866. packssdw xmm4, xmm5
  867. packssdw xmm3, xmm4 // d3
  868. movdqa xmm7, result
  869. pslld xmm7, 16
  870. movdqa xmm4, xmm0
  871. movdqa xmm5, xmm1
  872. pcmpgtw xmm0, xmm3 // b0
  873. pcmpgtw xmm1, xmm2 // b1
  874. pcmpgtw xmm4, xmm2 // b2
  875. pcmpgtw xmm5, xmm3 // b3
  876. pcmpgtw xmm2, xmm3 // b4
  877. pand xmm4, xmm1 // x0
  878. pand xmm5, xmm0 // x1
  879. pand xmm2, xmm0 // x2
  880. por xmm4, xmm5
  881. pand xmm2, SIMD_SSE2_word_1
  882. pand xmm4, SIMD_SSE2_word_2
  883. por xmm2, xmm4
  884. pshufd xmm5, xmm2, R_SHUFFLE_D( 2, 3, 0, 1 )
  885. punpcklwd xmm2, SIMD_SSE2_word_0
  886. punpcklwd xmm5, SIMD_SSE2_word_0
  887. pslld xmm5, 8
  888. por xmm7, xmm5
  889. por xmm7, xmm2
  890. movdqa result, xmm7
  891. sub eax, 32
  892. jge loop1
  893. mov esi, outPtr
  894. pshufd xmm4, xmm7, R_SHUFFLE_D( 1, 2, 3, 0 )
  895. pshufd xmm5, xmm7, R_SHUFFLE_D( 2, 3, 0, 1 )
  896. pshufd xmm6, xmm7, R_SHUFFLE_D( 3, 0, 1, 2 )
  897. pslld xmm4, 2
  898. pslld xmm5, 4
  899. pslld xmm6, 6
  900. por xmm7, xmm4
  901. por xmm7, xmm5
  902. por xmm7, xmm6
  903. movd dword ptr [esi], xmm7
  904. }
  905. outData += 4;
  906. #elif defined ( ID_WIN_X86_SSE2_INTRIN )
  907. __m128c zero = SIMD_SSE2_zero;
  908. __m128c result = SIMD_SSE2_zero;
  909. __m128c color0, color1, color2, color3;
  910. __m128c temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
  911. __m128c minColor = _mm_cvtsi32_si128( *(int *)minColor_ );
  912. __m128c maxColor = _mm_cvtsi32_si128( *(int *)maxColor_ );
  913. __m128c blocka[2], blockb[2];
  914. blocka[0] = *((__m128i *)(&colorBlock[ 0]));
  915. blocka[1] = *((__m128i *)(&colorBlock[32]));
  916. blockb[0] = *((__m128i *)(&colorBlock[16]));
  917. blockb[1] = *((__m128i *)(&colorBlock[48]));
  918. temp7 = zero;
  919. temp0 = maxColor;
  920. temp0 = _mm_and_si128( temp0, *(__m128c*)SIMD_SSE2_byte_colorMask2 );
  921. color0 = _mm_shuffle_epi32( temp0, R_SHUFFLE_D( 0, 1, 0, 1 ) );
  922. temp1 = minColor;
  923. temp1 = _mm_and_si128( temp1, *(__m128c*)SIMD_SSE2_byte_colorMask2 );
  924. color1 = _mm_shuffle_epi32( temp1, R_SHUFFLE_D( 0, 1, 0, 1 ) );
  925. temp0 = _mm_unpacklo_epi8( color0, zero );
  926. temp1 = _mm_unpacklo_epi8( color1, zero );
  927. temp6 = _mm_add_epi16( temp1, temp0 );
  928. temp0 = _mm_add_epi16( temp0, temp6 );
  929. temp0 = _mm_mulhi_epi16( temp0, (const __m128i &)SIMD_SSE2_word_div_by_3 ); // * ( ( 1 << 16 ) / 3 + 1 ) ) >> 16
  930. temp0 = _mm_packus_epi16( temp0, zero );
  931. color2 = _mm_shuffle_epi32( temp0, R_SHUFFLE_D( 0, 1, 0, 1 ) );
  932. temp1 = _mm_add_epi16( temp1, temp6 );
  933. temp1 = _mm_mulhi_epi16( temp1, (const __m128i &)SIMD_SSE2_word_div_by_3 ); // * ( ( 1 << 16 ) / 3 + 1 ) ) >> 16
  934. temp1 = _mm_packus_epi16( temp1, zero );
  935. color3 = _mm_shuffle_epi32( temp1, R_SHUFFLE_D( 0, 1, 0, 1 ) );
	for ( int i = 1; i >= 0; i-- ) {
		// Load block
		temp3 = _mm_shuffle_epi32( blocka[i], R_SHUFFLE_D( 0, 2, 1, 3 ) );
		temp5 = _mm_shuffle_ps( blocka[i], zero, R_SHUFFLE_D( 2, 3, 0, 1 ) );
		temp5 = _mm_shuffle_epi32( temp5, R_SHUFFLE_D( 0, 2, 1, 3 ) );
		temp0 = _mm_sad_epu8( temp3, color0 );
		temp6 = _mm_sad_epu8( temp5, color0 );
		temp0 = _mm_packs_epi32( temp0, temp6 );
		temp1 = _mm_sad_epu8( temp3, color1 );
		temp6 = _mm_sad_epu8( temp5, color1 );
		temp1 = _mm_packs_epi32( temp1, temp6 );
		temp2 = _mm_sad_epu8( temp3, color2 );
		temp6 = _mm_sad_epu8( temp5, color2 );
		temp2 = _mm_packs_epi32( temp2, temp6 );
		temp3 = _mm_sad_epu8( temp3, color3 );
		temp5 = _mm_sad_epu8( temp5, color3 );
		temp3 = _mm_packs_epi32( temp3, temp5 );
		// Load block
		temp4 = _mm_shuffle_epi32( blockb[i], R_SHUFFLE_D( 0, 2, 1, 3 ) );
		temp5 = _mm_shuffle_ps( blockb[i], zero, R_SHUFFLE_D( 2, 3, 0, 1 ) );
		temp5 = _mm_shuffle_epi32( temp5, R_SHUFFLE_D( 0, 2, 1, 3 ) );
		temp6 = _mm_sad_epu8( temp4, color0 );
		temp7 = _mm_sad_epu8( temp5, color0 );
		temp6 = _mm_packs_epi32( temp6, temp7 );
		temp0 = _mm_packs_epi32( temp0, temp6 ); // d0
		temp6 = _mm_sad_epu8( temp4, color1 );
		temp7 = _mm_sad_epu8( temp5, color1 );
		temp6 = _mm_packs_epi32( temp6, temp7 );
		temp1 = _mm_packs_epi32( temp1, temp6 ); // d1
		temp6 = _mm_sad_epu8( temp4, color2 );
		temp7 = _mm_sad_epu8( temp5, color2 );
		temp6 = _mm_packs_epi32( temp6, temp7 );
		temp2 = _mm_packs_epi32( temp2, temp6 ); // d2
		temp4 = _mm_sad_epu8( temp4, color3 );
		temp5 = _mm_sad_epu8( temp5, color3 );
		temp4 = _mm_packs_epi32( temp4, temp5 );
		temp3 = _mm_packs_epi32( temp3, temp4 ); // d3
		temp7 = _mm_slli_epi32( result, 16 );
		temp4 = _mm_cmpgt_epi16( temp0, temp2 ); // b2
		temp5 = _mm_cmpgt_epi16( temp1, temp3 ); // b3
		temp0 = _mm_cmpgt_epi16( temp0, temp3 ); // b0
		temp1 = _mm_cmpgt_epi16( temp1, temp2 ); // b1
		temp2 = _mm_cmpgt_epi16( temp2, temp3 ); // b4
		temp4 = _mm_and_si128( temp4, temp1 ); // x0
		temp5 = _mm_and_si128( temp5, temp0 ); // x1
		temp2 = _mm_and_si128( temp2, temp0 ); // x2
		temp4 = _mm_or_si128( temp4, temp5 );
		temp2 = _mm_and_si128( temp2, (const __m128i &)SIMD_SSE2_word_1 );
		temp4 = _mm_and_si128( temp4, (const __m128i &)SIMD_SSE2_word_2 );
		temp2 = _mm_or_si128( temp2, temp4 );
		temp5 = _mm_shuffle_epi32( temp2, R_SHUFFLE_D( 2, 3, 0, 1 ) );
		temp2 = _mm_unpacklo_epi16( temp2, (const __m128i &)SIMD_SSE2_word_0 );
		temp5 = _mm_unpacklo_epi16( temp5, (const __m128i &)SIMD_SSE2_word_0 );
		temp5 = _mm_slli_epi32( temp5, 8 );
		temp7 = _mm_or_si128( temp7, temp5 );
		result = _mm_or_si128( temp7, temp2 );
	}
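	// Scalar sketch of the selection above (illustrative, not part of the
	// original encoder): with d0..d3 the SADs against color0..color3,
	//	b0 = d0 > d3;  b1 = d1 > d2;  b2 = d0 > d2;  b3 = d1 > d3;  b4 = d2 > d3;
	//	x0 = b1 & b2;  x1 = b0 & b3;  x2 = b0 & b4;
	//	index = ( x2 ? 1 : 0 ) | ( ( x0 | x1 ) ? 2 : 0 );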
	temp4 = _mm_shuffle_epi32( result, R_SHUFFLE_D( 1, 2, 3, 0 ) );
	temp5 = _mm_shuffle_epi32( result, R_SHUFFLE_D( 2, 3, 0, 1 ) );
	temp6 = _mm_shuffle_epi32( result, R_SHUFFLE_D( 3, 0, 1, 2 ) );
	temp4 = _mm_slli_epi32( temp4, 2 );
	temp5 = _mm_slli_epi32( temp5, 4 );
	temp6 = _mm_slli_epi32( temp6, 6 );
	temp7 = _mm_or_si128( result, temp4 );
	temp7 = _mm_or_si128( temp7, temp5 );
	temp7 = _mm_or_si128( temp7, temp6 );
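	// Illustrative note (not in the original): each dword lane of result holds
	// four 2-bit indices spaced 8 bits apart; ORing in the rotated lanes
	// shifted by 2, 4 and 6 interleaves them so the low dword carries all
	// 16 indices.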
	unsigned int out = _mm_cvtsi128_si32( temp7 );
	EmitUInt( out );
#else
	assert( false );
#endif
}
/*
========================
idDxtEncoder::EmitAlphaIndices_SSE2
params:	block		- 16 pixel block for which to find alpha indices
paramO:	minAlpha	- Min alpha found
paramO:	maxAlpha	- Max alpha found
========================
*/
void idDxtEncoder::EmitAlphaIndices_SSE2( const byte *block, const int minAlpha_, const int maxAlpha_ ) {
#if ( defined( ID_WIN_X86_ASM ) || defined( ID_MAC_X86_ASM ) )
	assert( maxAlpha_ >= minAlpha_ );
	byte *outPtr = outData;
	__asm {
		mov esi, block
		movdqa xmm0, xmmword ptr [esi+ 0]
		movdqa xmm5, xmmword ptr [esi+16]
		movdqa xmm6, xmmword ptr [esi+32]
		movdqa xmm4, xmmword ptr [esi+48]
		psrld xmm0, 24
		psrld xmm5, 24
		psrld xmm6, 24
		psrld xmm4, 24
		packuswb xmm0, xmm5
		packuswb xmm6, xmm4
		//---------------------
		// ab0 = ( 7 * maxAlpha + 7 * minAlpha + ALPHA_RANGE ) / 14
		// ab3 = ( 9 * maxAlpha + 5 * minAlpha + ALPHA_RANGE ) / 14
		// ab2 = ( 11 * maxAlpha + 3 * minAlpha + ALPHA_RANGE ) / 14
		// ab1 = ( 13 * maxAlpha + 1 * minAlpha + ALPHA_RANGE ) / 14
		// ab4 = ( 7 * maxAlpha + 7 * minAlpha + ALPHA_RANGE ) / 14
		// ab5 = ( 5 * maxAlpha + 9 * minAlpha + ALPHA_RANGE ) / 14
		// ab6 = ( 3 * maxAlpha + 11 * minAlpha + ALPHA_RANGE ) / 14
		// ab7 = ( 1 * maxAlpha + 13 * minAlpha + ALPHA_RANGE ) / 14
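		// Illustrative note (not in the original): assuming ALPHA_RANGE == 7
		// (the SIMD_SSE2_word_7 constant below suggests this), the ab values
		// are the midpoints between consecutive entries of the 8-entry DXT5
		// alpha palette alpha[i] = ( ( 7 - i ) * maxAlpha + i * minAlpha ) / 7,
		// i.e. the thresholds against which each pixel's alpha is classified.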
		movd xmm5, maxAlpha_
		pshuflw xmm5, xmm5, R_SHUFFLE_D( 0, 0, 0, 0 )
		pshufd xmm5, xmm5, R_SHUFFLE_D( 0, 0, 0, 0 )
		movdqa xmm7, xmm5
		movd xmm2, minAlpha_
		pshuflw xmm2, xmm2, R_SHUFFLE_D( 0, 0, 0, 0 )
		pshufd xmm2, xmm2, R_SHUFFLE_D( 0, 0, 0, 0 )
		movdqa xmm3, xmm2
		pmullw xmm5, SIMD_SSE2_word_scale_7_9_11_13
		pmullw xmm7, SIMD_SSE2_word_scale_7_5_3_1
		pmullw xmm2, SIMD_SSE2_word_scale_7_5_3_1
		pmullw xmm3, SIMD_SSE2_word_scale_7_9_11_13
		paddw xmm5, xmm2
		paddw xmm7, xmm3
		paddw xmm5, SIMD_SSE2_word_7
		paddw xmm7, SIMD_SSE2_word_7
		pmulhw xmm5, SIMD_SSE2_word_div_by_14 // * ( ( 1 << 16 ) / 14 + 1 ) ) >> 16
		pmulhw xmm7, SIMD_SSE2_word_div_by_14 // * ( ( 1 << 16 ) / 14 + 1 ) ) >> 16
		pshufd xmm1, xmm5, R_SHUFFLE_D( 3, 3, 3, 3 )
		pshufd xmm2, xmm5, R_SHUFFLE_D( 2, 2, 2, 2 )
		pshufd xmm3, xmm5, R_SHUFFLE_D( 1, 1, 1, 1 )
		packuswb xmm1, xmm1 // ab1
		packuswb xmm2, xmm2 // ab2
		packuswb xmm3, xmm3 // ab3
		packuswb xmm0, xmm6 // alpha block
		pshufd xmm4, xmm7, R_SHUFFLE_D( 0, 0, 0, 0 )
		pshufd xmm5, xmm7, R_SHUFFLE_D( 1, 1, 1, 1 )
		pshufd xmm6, xmm7, R_SHUFFLE_D( 2, 2, 2, 2 )
		pshufd xmm7, xmm7, R_SHUFFLE_D( 3, 3, 3, 3 )
		packuswb xmm4, xmm4 // ab4
		packuswb xmm5, xmm5 // ab5
		packuswb xmm6, xmm6 // ab6
		packuswb xmm7, xmm7 // ab7
		pmaxub xmm1, xmm0
		pmaxub xmm2, xmm0
		pmaxub xmm3, xmm0
		pcmpeqb xmm1, xmm0
		pcmpeqb xmm2, xmm0
		pcmpeqb xmm3, xmm0
		pmaxub xmm4, xmm0
		pmaxub xmm5, xmm0
		pmaxub xmm6, xmm0
		pmaxub xmm7, xmm0
		pcmpeqb xmm4, xmm0
		pcmpeqb xmm5, xmm0
		pcmpeqb xmm6, xmm0
		pcmpeqb xmm7, xmm0
		movdqa xmm0, SIMD_SSE2_byte_8
		paddsb xmm0, xmm1
		paddsb xmm2, xmm3
		paddsb xmm4, xmm5
		paddsb xmm6, xmm7
		paddsb xmm0, xmm2
		paddsb xmm4, xmm6
		paddsb xmm0, xmm4
		pand xmm0, SIMD_SSE2_byte_7
		movdqa xmm1, SIMD_SSE2_byte_2
		pcmpgtb xmm1, xmm0
		pand xmm1, SIMD_SSE2_byte_1
		pxor xmm0, xmm1
		movdqa xmm1, xmm0
		movdqa xmm2, xmm0
		movdqa xmm3, xmm0
		movdqa xmm4, xmm0
		movdqa xmm5, xmm0
		movdqa xmm6, xmm0
		movdqa xmm7, xmm0
		psrlq xmm1, 8 - 3
		psrlq xmm2, 16 - 6
		psrlq xmm3, 24 - 9
		psrlq xmm4, 32 - 12
		psrlq xmm5, 40 - 15
		psrlq xmm6, 48 - 18
		psrlq xmm7, 56 - 21
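		// Byte k of xmm0 holds a 3-bit index; shifting copy k right by
		// 8*k - 3*k bits moves index k from bit 8*k down to bit 3*k, and the
		// masks below gather the 16 indices into 48 contiguous bits.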
		pand xmm0, SIMD_SSE2_dword_alpha_bit_mask0
		pand xmm1, SIMD_SSE2_dword_alpha_bit_mask1
		pand xmm2, SIMD_SSE2_dword_alpha_bit_mask2
		pand xmm3, SIMD_SSE2_dword_alpha_bit_mask3
		pand xmm4, SIMD_SSE2_dword_alpha_bit_mask4
		pand xmm5, SIMD_SSE2_dword_alpha_bit_mask5
		pand xmm6, SIMD_SSE2_dword_alpha_bit_mask6
		pand xmm7, SIMD_SSE2_dword_alpha_bit_mask7
		por xmm0, xmm1
		por xmm2, xmm3
		por xmm4, xmm5
		por xmm6, xmm7
		por xmm0, xmm2
		por xmm4, xmm6
		por xmm0, xmm4
		mov esi, outPtr
		movd [esi+0], xmm0
		pshufd xmm1, xmm0, R_SHUFFLE_D( 2, 3, 0, 1 )
		movd [esi+3], xmm1
	}
	outData += 6;
#elif defined ( ID_WIN_X86_SSE2_INTRIN )
	__m128i block0 = *((__m128i *)(&block[ 0]));
	__m128i block1 = *((__m128i *)(&block[16]));
	__m128i block2 = *((__m128i *)(&block[32]));
	__m128i block3 = *((__m128i *)(&block[48]));
	__m128c temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
	temp0 = _mm_srli_epi32( block0, 24 );
	temp5 = _mm_srli_epi32( block1, 24 );
	temp6 = _mm_srli_epi32( block2, 24 );
	temp4 = _mm_srli_epi32( block3, 24 );
	temp0 = _mm_packus_epi16( temp0, temp5 );
	temp6 = _mm_packus_epi16( temp6, temp4 );
	//---------------------
	// ab0 = ( 7 * maxAlpha + 7 * minAlpha + ALPHA_RANGE ) / 14
	// ab3 = ( 9 * maxAlpha + 5 * minAlpha + ALPHA_RANGE ) / 14
	// ab2 = ( 11 * maxAlpha + 3 * minAlpha + ALPHA_RANGE ) / 14
	// ab1 = ( 13 * maxAlpha + 1 * minAlpha + ALPHA_RANGE ) / 14
	// ab4 = ( 7 * maxAlpha + 7 * minAlpha + ALPHA_RANGE ) / 14
	// ab5 = ( 5 * maxAlpha + 9 * minAlpha + ALPHA_RANGE ) / 14
	// ab6 = ( 3 * maxAlpha + 11 * minAlpha + ALPHA_RANGE ) / 14
	// ab7 = ( 1 * maxAlpha + 13 * minAlpha + ALPHA_RANGE ) / 14
	temp5 = _mm_cvtsi32_si128( maxAlpha_ );
	temp5 = _mm_shufflelo_epi16( temp5, R_SHUFFLE_D( 0, 0, 0, 0 ) );
	temp5 = _mm_shuffle_epi32( temp5, R_SHUFFLE_D( 0, 0, 0, 0 ) );
	temp2 = _mm_cvtsi32_si128( minAlpha_ );
	temp2 = _mm_shufflelo_epi16( temp2, R_SHUFFLE_D( 0, 0, 0, 0 ) );
	temp2 = _mm_shuffle_epi32( temp2, R_SHUFFLE_D( 0, 0, 0, 0 ) );
	temp7 = _mm_mullo_epi16( temp5, (const __m128i &)SIMD_SSE2_word_scale_7_5_3_1 );
	temp5 = _mm_mullo_epi16( temp5, (const __m128i &)SIMD_SSE2_word_scale_7_9_11_13 );
	temp3 = _mm_mullo_epi16( temp2, (const __m128i &)SIMD_SSE2_word_scale_7_9_11_13 );
	temp2 = _mm_mullo_epi16( temp2, (const __m128i &)SIMD_SSE2_word_scale_7_5_3_1 );
	temp5 = _mm_add_epi16( temp5, temp2 );
	temp7 = _mm_add_epi16( temp7, temp3 );
	temp5 = _mm_add_epi16( temp5, (const __m128i &)SIMD_SSE2_word_7 );
	temp7 = _mm_add_epi16( temp7, (const __m128i &)SIMD_SSE2_word_7 );
	temp5 = _mm_mulhi_epi16( temp5, (const __m128i &)SIMD_SSE2_word_div_by_14 );
	temp7 = _mm_mulhi_epi16( temp7, (const __m128i &)SIMD_SSE2_word_div_by_14 );
	temp1 = _mm_shuffle_epi32( temp5, R_SHUFFLE_D( 3, 3, 3, 3 ) );
	temp2 = _mm_shuffle_epi32( temp5, R_SHUFFLE_D( 2, 2, 2, 2 ) );
	temp3 = _mm_shuffle_epi32( temp5, R_SHUFFLE_D( 1, 1, 1, 1 ) );
	temp1 = _mm_packus_epi16( temp1, temp1 );
	temp2 = _mm_packus_epi16( temp2, temp2 );
	temp3 = _mm_packus_epi16( temp3, temp3 );
	temp0 = _mm_packus_epi16( temp0, temp6 );
	temp4 = _mm_shuffle_epi32( temp7, R_SHUFFLE_D( 0, 0, 0, 0 ) );
	temp5 = _mm_shuffle_epi32( temp7, R_SHUFFLE_D( 1, 1, 1, 1 ) );
	temp6 = _mm_shuffle_epi32( temp7, R_SHUFFLE_D( 2, 2, 2, 2 ) );
	temp7 = _mm_shuffle_epi32( temp7, R_SHUFFLE_D( 3, 3, 3, 3 ) );
	temp4 = _mm_packus_epi16( temp4, temp4 );
	temp5 = _mm_packus_epi16( temp5, temp5 );
	temp6 = _mm_packus_epi16( temp6, temp6 );
	temp7 = _mm_packus_epi16( temp7, temp7 );
	temp1 = _mm_max_epu8( temp1, temp0 );
	temp2 = _mm_max_epu8( temp2, temp0 );
	temp3 = _mm_max_epu8( temp3, temp0 );
	temp1 = _mm_cmpeq_epi8( temp1, temp0 );
	temp2 = _mm_cmpeq_epi8( temp2, temp0 );
	temp3 = _mm_cmpeq_epi8( temp3, temp0 );
	temp4 = _mm_max_epu8( temp4, temp0 );
	temp5 = _mm_max_epu8( temp5, temp0 );
	temp6 = _mm_max_epu8( temp6, temp0 );
	temp7 = _mm_max_epu8( temp7, temp0 );
	temp4 = _mm_cmpeq_epi8( temp4, temp0 );
	temp5 = _mm_cmpeq_epi8( temp5, temp0 );
	temp6 = _mm_cmpeq_epi8( temp6, temp0 );
	temp7 = _mm_cmpeq_epi8( temp7, temp0 );
	temp0 = _mm_adds_epi8( (const __m128i &)SIMD_SSE2_byte_8, temp1 );
	temp2 = _mm_adds_epi8( temp2, temp3 );
	temp4 = _mm_adds_epi8( temp4, temp5 );
	temp6 = _mm_adds_epi8( temp6, temp7 );
	temp0 = _mm_adds_epi8( temp0, temp2 );
	temp4 = _mm_adds_epi8( temp4, temp6 );
	temp0 = _mm_adds_epi8( temp0, temp4 );
	temp0 = _mm_and_si128( temp0, (const __m128i &)SIMD_SSE2_byte_7 );
	temp1 = _mm_cmpgt_epi8( (const __m128i &)SIMD_SSE2_byte_2, temp0 );
	temp1 = _mm_and_si128( temp1, (const __m128i &)SIMD_SSE2_byte_1 );
	temp0 = _mm_xor_si128( temp0, temp1 );
	temp1 = _mm_srli_epi64( temp0, 8 - 3 );
	temp2 = _mm_srli_epi64( temp0, 16 - 6 );
	temp3 = _mm_srli_epi64( temp0, 24 - 9 );
	temp4 = _mm_srli_epi64( temp0, 32 - 12 );
	temp5 = _mm_srli_epi64( temp0, 40 - 15 );
	temp6 = _mm_srli_epi64( temp0, 48 - 18 );
	temp7 = _mm_srli_epi64( temp0, 56 - 21 );
	temp0 = _mm_and_si128( temp0, (const __m128i &)SIMD_SSE2_dword_alpha_bit_mask0 );
	temp1 = _mm_and_si128( temp1, (const __m128i &)SIMD_SSE2_dword_alpha_bit_mask1 );
	temp2 = _mm_and_si128( temp2, (const __m128i &)SIMD_SSE2_dword_alpha_bit_mask2 );
	temp3 = _mm_and_si128( temp3, (const __m128i &)SIMD_SSE2_dword_alpha_bit_mask3 );
	temp4 = _mm_and_si128( temp4, (const __m128i &)SIMD_SSE2_dword_alpha_bit_mask4 );
	temp5 = _mm_and_si128( temp5, (const __m128i &)SIMD_SSE2_dword_alpha_bit_mask5 );
	temp6 = _mm_and_si128( temp6, (const __m128i &)SIMD_SSE2_dword_alpha_bit_mask6 );
	temp7 = _mm_and_si128( temp7, (const __m128i &)SIMD_SSE2_dword_alpha_bit_mask7 );
	temp0 = _mm_or_si128( temp0, temp1 );
	temp2 = _mm_or_si128( temp2, temp3 );
	temp4 = _mm_or_si128( temp4, temp5 );
	temp6 = _mm_or_si128( temp6, temp7 );
	temp0 = _mm_or_si128( temp0, temp2 );
	temp4 = _mm_or_si128( temp4, temp6 );
	temp0 = _mm_or_si128( temp0, temp4 );
	int out = _mm_cvtsi128_si32( temp0 );
	EmitUInt( out );
	outData--;
	temp1 = _mm_shuffle_epi32( temp0, R_SHUFFLE_D( 2, 3, 0, 1 ) );
	out = _mm_cvtsi128_si32( temp1 );
	EmitUInt( out );
	outData--;
#else
	assert( false );
#endif
}
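/*
Illustrative scalar reference for the alpha index selection above (a sketch,
not part of the original encoder), with ab[1] (largest) .. ab[7] (smallest)
being the thresholds from the comment block:

	int count = 0;
	for ( int i = 1; i <= 7; i++ ) {
		count += ( alpha >= ab[i] );
	}
	int index = ( 8 - count ) & 7;	// 8 (alpha above all thresholds) wraps to 0
	if ( index < 2 ) {
		index ^= 1;					// palette slots 0/1 hold the max/min endpoints
	}
*/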
/*
========================
idDxtEncoder::EmitAlphaIndices_SSE2
========================
*/
void idDxtEncoder::EmitAlphaIndices_SSE2( const byte *block, const int channelBitOffset, const int minAlpha_, const int maxAlpha_ ) {
#if ( defined( ID_WIN_X86_ASM ) || defined( ID_MAC_X86_ASM ) )
	assert( maxAlpha_ >= minAlpha_ );
	byte *outPtr = outData;
	__asm {
		movd xmm7, channelBitOffset
		mov esi, block
		movdqa xmm0, xmmword ptr [esi+ 0]
		movdqa xmm5, xmmword ptr [esi+16]
		movdqa xmm6, xmmword ptr [esi+32]
		movdqa xmm4, xmmword ptr [esi+48]
		psrld xmm0, xmm7
		psrld xmm5, xmm7
		psrld xmm6, xmm7
		psrld xmm4, xmm7
		pand xmm0, SIMD_SSE2_dword_byte_mask
		pand xmm5, SIMD_SSE2_dword_byte_mask
		pand xmm6, SIMD_SSE2_dword_byte_mask
		pand xmm4, SIMD_SSE2_dword_byte_mask
		packuswb xmm0, xmm5
		packuswb xmm6, xmm4
		//---------------------
		// ab0 = ( 7 * maxAlpha + 7 * minAlpha + ALPHA_RANGE ) / 14
		// ab3 = ( 9 * maxAlpha + 5 * minAlpha + ALPHA_RANGE ) / 14
		// ab2 = ( 11 * maxAlpha + 3 * minAlpha + ALPHA_RANGE ) / 14
		// ab1 = ( 13 * maxAlpha + 1 * minAlpha + ALPHA_RANGE ) / 14
		// ab4 = ( 7 * maxAlpha + 7 * minAlpha + ALPHA_RANGE ) / 14
		// ab5 = ( 5 * maxAlpha + 9 * minAlpha + ALPHA_RANGE ) / 14
		// ab6 = ( 3 * maxAlpha + 11 * minAlpha + ALPHA_RANGE ) / 14
		// ab7 = ( 1 * maxAlpha + 13 * minAlpha + ALPHA_RANGE ) / 14
		movd xmm5, maxAlpha_
		pshuflw xmm5, xmm5, R_SHUFFLE_D( 0, 0, 0, 0 )
		pshufd xmm5, xmm5, R_SHUFFLE_D( 0, 0, 0, 0 )
		movdqa xmm7, xmm5
		movd xmm2, minAlpha_
		pshuflw xmm2, xmm2, R_SHUFFLE_D( 0, 0, 0, 0 )
		pshufd xmm2, xmm2, R_SHUFFLE_D( 0, 0, 0, 0 )
		movdqa xmm3, xmm2
		pmullw xmm5, SIMD_SSE2_word_scale_7_9_11_13
		pmullw xmm7, SIMD_SSE2_word_scale_7_5_3_1
		pmullw xmm2, SIMD_SSE2_word_scale_7_5_3_1
		pmullw xmm3, SIMD_SSE2_word_scale_7_9_11_13
		paddw xmm5, xmm2
		paddw xmm7, xmm3
		paddw xmm5, SIMD_SSE2_word_7
		paddw xmm7, SIMD_SSE2_word_7
		pmulhw xmm5, SIMD_SSE2_word_div_by_14 // * ( ( 1 << 16 ) / 14 + 1 ) ) >> 16
		pmulhw xmm7, SIMD_SSE2_word_div_by_14 // * ( ( 1 << 16 ) / 14 + 1 ) ) >> 16
		pshufd xmm1, xmm5, R_SHUFFLE_D( 3, 3, 3, 3 )
		pshufd xmm2, xmm5, R_SHUFFLE_D( 2, 2, 2, 2 )
		pshufd xmm3, xmm5, R_SHUFFLE_D( 1, 1, 1, 1 )
		packuswb xmm1, xmm1 // ab1
		packuswb xmm2, xmm2 // ab2
		packuswb xmm3, xmm3 // ab3
		packuswb xmm0, xmm6 // alpha block
		pshufd xmm4, xmm7, R_SHUFFLE_D( 0, 0, 0, 0 )
		pshufd xmm5, xmm7, R_SHUFFLE_D( 1, 1, 1, 1 )
		pshufd xmm6, xmm7, R_SHUFFLE_D( 2, 2, 2, 2 )
		pshufd xmm7, xmm7, R_SHUFFLE_D( 3, 3, 3, 3 )
		packuswb xmm4, xmm4 // ab4
		packuswb xmm5, xmm5 // ab5
		packuswb xmm6, xmm6 // ab6
		packuswb xmm7, xmm7 // ab7
		pmaxub xmm1, xmm0
		pmaxub xmm2, xmm0
		pmaxub xmm3, xmm0
		pcmpeqb xmm1, xmm0
		pcmpeqb xmm2, xmm0
		pcmpeqb xmm3, xmm0
		pmaxub xmm4, xmm0
		pmaxub xmm5, xmm0
		pmaxub xmm6, xmm0
		pmaxub xmm7, xmm0
		pcmpeqb xmm4, xmm0
		pcmpeqb xmm5, xmm0
		pcmpeqb xmm6, xmm0
		pcmpeqb xmm7, xmm0
		movdqa xmm0, SIMD_SSE2_byte_8
		paddsb xmm0, xmm1
		paddsb xmm2, xmm3
		paddsb xmm4, xmm5
		paddsb xmm6, xmm7
		paddsb xmm0, xmm2
		paddsb xmm4, xmm6
		paddsb xmm0, xmm4
		pand xmm0, SIMD_SSE2_byte_7
		movdqa xmm1, SIMD_SSE2_byte_2
		pcmpgtb xmm1, xmm0
		pand xmm1, SIMD_SSE2_byte_1
		pxor xmm0, xmm1
		movdqa xmm1, xmm0
		movdqa xmm2, xmm0
		movdqa xmm3, xmm0
		movdqa xmm4, xmm0
		movdqa xmm5, xmm0
		movdqa xmm6, xmm0
		movdqa xmm7, xmm0
		psrlq xmm1, 8 - 3
		psrlq xmm2, 16 - 6
		psrlq xmm3, 24 - 9
		psrlq xmm4, 32 - 12
		psrlq xmm5, 40 - 15
		psrlq xmm6, 48 - 18
		psrlq xmm7, 56 - 21
		pand xmm0, SIMD_SSE2_dword_alpha_bit_mask0
		pand xmm1, SIMD_SSE2_dword_alpha_bit_mask1
		pand xmm2, SIMD_SSE2_dword_alpha_bit_mask2
		pand xmm3, SIMD_SSE2_dword_alpha_bit_mask3
		pand xmm4, SIMD_SSE2_dword_alpha_bit_mask4
		pand xmm5, SIMD_SSE2_dword_alpha_bit_mask5
		pand xmm6, SIMD_SSE2_dword_alpha_bit_mask6
		pand xmm7, SIMD_SSE2_dword_alpha_bit_mask7
		por xmm0, xmm1
		por xmm2, xmm3
		por xmm4, xmm5
		por xmm6, xmm7
		por xmm0, xmm2
		por xmm4, xmm6
		por xmm0, xmm4
		mov esi, outPtr
		movd [esi+0], xmm0
		pshufd xmm1, xmm0, R_SHUFFLE_D( 2, 3, 0, 1 )
		movd [esi+3], xmm1
	}
	outData += 6;
#elif defined ( ID_WIN_X86_SSE2_INTRIN )
	__m128i block0 = *((__m128i *)(&block[ 0]));
	__m128i block1 = *((__m128i *)(&block[16]));
	__m128i block2 = *((__m128i *)(&block[32]));
	__m128i block3 = *((__m128i *)(&block[48]));
	__m128c temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
	temp7 = _mm_cvtsi32_si128( channelBitOffset );
	temp0 = _mm_srl_epi32( block0, temp7 );
	temp5 = _mm_srl_epi32( block1, temp7 );
	temp6 = _mm_srl_epi32( block2, temp7 );
	temp4 = _mm_srl_epi32( block3, temp7 );
	temp0 = _mm_and_si128( temp0, (const __m128i &)SIMD_SSE2_dword_byte_mask );
	temp5 = _mm_and_si128( temp5, (const __m128i &)SIMD_SSE2_dword_byte_mask );
	temp6 = _mm_and_si128( temp6, (const __m128i &)SIMD_SSE2_dword_byte_mask );
	temp4 = _mm_and_si128( temp4, (const __m128i &)SIMD_SSE2_dword_byte_mask );
	temp0 = _mm_packus_epi16( temp0, temp5 );
	temp6 = _mm_packus_epi16( temp6, temp4 );
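	// Note on the extraction above (illustrative): the variable shift selects
	// which byte lane (bit offset 0, 8, 16 or 24) of each pixel feeds the
	// block; masking to a byte and packing twice narrows 16 dwords to 16 bytes.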
	//---------------------
	// ab0 = ( 7 * maxAlpha + 7 * minAlpha + ALPHA_RANGE ) / 14
	// ab3 = ( 9 * maxAlpha + 5 * minAlpha + ALPHA_RANGE ) / 14
	// ab2 = ( 11 * maxAlpha + 3 * minAlpha + ALPHA_RANGE ) / 14
	// ab1 = ( 13 * maxAlpha + 1 * minAlpha + ALPHA_RANGE ) / 14
	// ab4 = ( 7 * maxAlpha + 7 * minAlpha + ALPHA_RANGE ) / 14
	// ab5 = ( 5 * maxAlpha + 9 * minAlpha + ALPHA_RANGE ) / 14
	// ab6 = ( 3 * maxAlpha + 11 * minAlpha + ALPHA_RANGE ) / 14
	// ab7 = ( 1 * maxAlpha + 13 * minAlpha + ALPHA_RANGE ) / 14
	temp5 = _mm_cvtsi32_si128( maxAlpha_ );
	temp5 = _mm_shufflelo_epi16( temp5, R_SHUFFLE_D( 0, 0, 0, 0 ) );
	temp5 = _mm_shuffle_epi32( temp5, R_SHUFFLE_D( 0, 0, 0, 0 ) );
	temp2 = _mm_cvtsi32_si128( minAlpha_ );
	temp2 = _mm_shufflelo_epi16( temp2, R_SHUFFLE_D( 0, 0, 0, 0 ) );
	temp2 = _mm_shuffle_epi32( temp2, R_SHUFFLE_D( 0, 0, 0, 0 ) );
	temp7 = _mm_mullo_epi16( temp5, (const __m128i &)SIMD_SSE2_word_scale_7_5_3_1 );
	temp5 = _mm_mullo_epi16( temp5, (const __m128i &)SIMD_SSE2_word_scale_7_9_11_13 );
	temp3 = _mm_mullo_epi16( temp2, (const __m128i &)SIMD_SSE2_word_scale_7_9_11_13 );
	temp2 = _mm_mullo_epi16( temp2, (const __m128i &)SIMD_SSE2_word_scale_7_5_3_1 );
	temp5 = _mm_add_epi16( temp5, temp2 );
	temp7 = _mm_add_epi16( temp7, temp3 );
	temp5 = _mm_add_epi16( temp5, (const __m128i &)SIMD_SSE2_word_7 );
	temp7 = _mm_add_epi16( temp7, (const __m128i &)SIMD_SSE2_word_7 );
	temp5 = _mm_mulhi_epi16( temp5, (const __m128i &)SIMD_SSE2_word_div_by_14 );
	temp7 = _mm_mulhi_epi16( temp7, (const __m128i &)SIMD_SSE2_word_div_by_14 );
	temp1 = _mm_shuffle_epi32( temp5, R_SHUFFLE_D( 3, 3, 3, 3 ) );
	temp2 = _mm_shuffle_epi32( temp5, R_SHUFFLE_D( 2, 2, 2, 2 ) );
	temp3 = _mm_shuffle_epi32( temp5, R_SHUFFLE_D( 1, 1, 1, 1 ) );
	temp1 = _mm_packus_epi16( temp1, temp1 );
	temp2 = _mm_packus_epi16( temp2, temp2 );
	temp3 = _mm_packus_epi16( temp3, temp3 );
	temp0 = _mm_packus_epi16( temp0, temp6 );
	temp4 = _mm_shuffle_epi32( temp7, R_SHUFFLE_D( 0, 0, 0, 0 ) );
	temp5 = _mm_shuffle_epi32( temp7, R_SHUFFLE_D( 1, 1, 1, 1 ) );
	temp6 = _mm_shuffle_epi32( temp7, R_SHUFFLE_D( 2, 2, 2, 2 ) );
	temp7 = _mm_shuffle_epi32( temp7, R_SHUFFLE_D( 3, 3, 3, 3 ) );
	temp4 = _mm_packus_epi16( temp4, temp4 );
	temp5 = _mm_packus_epi16( temp5, temp5 );
	temp6 = _mm_packus_epi16( temp6, temp6 );
	temp7 = _mm_packus_epi16( temp7, temp7 );
	temp1 = _mm_max_epu8( temp1, temp0 );
	temp2 = _mm_max_epu8( temp2, temp0 );
	temp3 = _mm_max_epu8( temp3, temp0 );
	temp1 = _mm_cmpeq_epi8( temp1, temp0 );
	temp2 = _mm_cmpeq_epi8( temp2, temp0 );
	temp3 = _mm_cmpeq_epi8( temp3, temp0 );
	temp4 = _mm_max_epu8( temp4, temp0 );
	temp5 = _mm_max_epu8( temp5, temp0 );
	temp6 = _mm_max_epu8( temp6, temp0 );
	temp7 = _mm_max_epu8( temp7, temp0 );
	temp4 = _mm_cmpeq_epi8( temp4, temp0 );
	temp5 = _mm_cmpeq_epi8( temp5, temp0 );
	temp6 = _mm_cmpeq_epi8( temp6, temp0 );
	temp7 = _mm_cmpeq_epi8( temp7, temp0 );
	temp0 = _mm_adds_epi8( (const __m128i &)SIMD_SSE2_byte_8, temp1 );
	temp2 = _mm_adds_epi8( temp2, temp3 );
	temp4 = _mm_adds_epi8( temp4, temp5 );
	temp6 = _mm_adds_epi8( temp6, temp7 );
	temp0 = _mm_adds_epi8( temp0, temp2 );
	temp4 = _mm_adds_epi8( temp4, temp6 );
	temp0 = _mm_adds_epi8( temp0, temp4 );
	temp0 = _mm_and_si128( temp0, (const __m128i &)SIMD_SSE2_byte_7 );
	temp1 = _mm_cmpgt_epi8( (const __m128i &)SIMD_SSE2_byte_2, temp0 );
	temp1 = _mm_and_si128( temp1, (const __m128i &)SIMD_SSE2_byte_1 );
	temp0 = _mm_xor_si128( temp0, temp1 );
	temp1 = _mm_srli_epi64( temp0, 8 - 3 );
	temp2 = _mm_srli_epi64( temp0, 16 - 6 );
	temp3 = _mm_srli_epi64( temp0, 24 - 9 );
	temp4 = _mm_srli_epi64( temp0, 32 - 12 );
	temp5 = _mm_srli_epi64( temp0, 40 - 15 );
	temp6 = _mm_srli_epi64( temp0, 48 - 18 );
	temp7 = _mm_srli_epi64( temp0, 56 - 21 );
	temp0 = _mm_and_si128( temp0, (const __m128i &)SIMD_SSE2_dword_alpha_bit_mask0 );
	temp1 = _mm_and_si128( temp1, (const __m128i &)SIMD_SSE2_dword_alpha_bit_mask1 );
	temp2 = _mm_and_si128( temp2, (const __m128i &)SIMD_SSE2_dword_alpha_bit_mask2 );
	temp3 = _mm_and_si128( temp3, (const __m128i &)SIMD_SSE2_dword_alpha_bit_mask3 );
	temp4 = _mm_and_si128( temp4, (const __m128i &)SIMD_SSE2_dword_alpha_bit_mask4 );
	temp5 = _mm_and_si128( temp5, (const __m128i &)SIMD_SSE2_dword_alpha_bit_mask5 );
	temp6 = _mm_and_si128( temp6, (const __m128i &)SIMD_SSE2_dword_alpha_bit_mask6 );
	temp7 = _mm_and_si128( temp7, (const __m128i &)SIMD_SSE2_dword_alpha_bit_mask7 );
	temp0 = _mm_or_si128( temp0, temp1 );
	temp2 = _mm_or_si128( temp2, temp3 );
	temp4 = _mm_or_si128( temp4, temp5 );
	temp6 = _mm_or_si128( temp6, temp7 );
	temp0 = _mm_or_si128( temp0, temp2 );
	temp4 = _mm_or_si128( temp4, temp6 );
	temp0 = _mm_or_si128( temp0, temp4 );
	int out = _mm_cvtsi128_si32( temp0 );
	EmitUInt( out );
	outData--;
	temp1 = _mm_shuffle_epi32( temp0, R_SHUFFLE_D( 2, 3, 0, 1 ) );
	out = _mm_cvtsi128_si32( temp1 );
	EmitUInt( out );
	outData--;
#else
	assert( false );
#endif
}
/*
========================
idDxtEncoder::CompressImageDXT1Fast_SSE2
params:	inBuf	- image to compress
paramO:	outBuf	- result of compression
params:	width	- width of image
params:	height	- height of image
========================
*/
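// Note (not in the original): DXT1 stores each 4x4 block as 8 bytes (two
// 5:6:5 endpoints followed by 16 two-bit indices), so an image compresses to
// width * height / 2 bytes plus any per-row padding.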
void idDxtEncoder::CompressImageDXT1Fast_SSE2( const byte *inBuf, byte *outBuf, int width, int height ) {
	ALIGN16( byte block[64] );
	ALIGN16( byte minColor[4] );
	ALIGN16( byte maxColor[4] );
	assert( width >= 4 && ( width & 3 ) == 0 );
	assert( height >= 4 && ( height & 3 ) == 0 );
	this->width = width;
	this->height = height;
	this->outData = outBuf;
	for ( int j = 0; j < height; j += 4, inBuf += width * 4*4 ) {
		for ( int i = 0; i < width; i += 4 ) {
			ExtractBlock_SSE2( inBuf + i * 4, width, block );
			GetMinMaxBBox_SSE2( block, minColor, maxColor );
			InsetColorsBBox_SSE2( minColor, maxColor );
			EmitUShort( ColorTo565( maxColor ) );
			EmitUShort( ColorTo565( minColor ) );
			EmitColorIndices_SSE2( block, minColor, maxColor );
		}
		outData += dstPadding;
		inBuf += srcPadding;
	}
#ifdef TEST_COMPRESSION
	int tmpDstPadding = dstPadding;
	dstPadding = 0;
	byte * testOutBuf = (byte *) _alloca16( width * height / 2 );
	CompressImageDXT1Fast_Generic( inBuf, testOutBuf, width, height );
	for ( int j = 0; j < height/4; j++ ) {
		for ( int i = 0; i < width/4; i++ ) {
			byte * ptr1 = outBuf + ( j * width/4 + i ) * 8 + j * tmpDstPadding;
			byte * ptr2 = testOutBuf + ( j * width/4 + i ) * 8;
			for ( int k = 0; k < 8; k++ ) {
				assert( ptr1[k] == ptr2[k] );
			}
		}
	}
	dstPadding = tmpDstPadding;
#endif
}
/*
========================
idDxtEncoder::CompressImageDXT1AlphaFast_SSE2
params:	inBuf	- image to compress
paramO:	outBuf	- result of compression
params:	width	- width of image
params:	height	- height of image
========================
*/
void idDxtEncoder::CompressImageDXT1AlphaFast_SSE2( const byte *inBuf, byte *outBuf, int width, int height ) {
	ALIGN16( byte block[64] );
	ALIGN16( byte minColor[4] );
	ALIGN16( byte maxColor[4] );
	assert( width >= 4 && ( width & 3 ) == 0 );
	assert( height >= 4 && ( height & 3 ) == 0 );
	this->width = width;
	this->height = height;
	this->outData = outBuf;
	for ( int j = 0; j < height; j += 4, inBuf += width * 4*4 ) {
		for ( int i = 0; i < width; i += 4 ) {
			ExtractBlock_SSE2( inBuf + i * 4, width, block );
			GetMinMaxBBox_SSE2( block, minColor, maxColor );
			byte minAlpha = minColor[3];
			InsetColorsBBox_SSE2( minColor, maxColor );
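			// Illustrative reading of the branch below (not in the original):
			// emitting minColor first makes color0 <= color1, which selects
			// DXT1's three-color mode with 1-bit transparency; emitting
			// maxColor first keeps the opaque four-color mode.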
			if ( minAlpha >= 128 ) {
				EmitUShort( ColorTo565( maxColor ) );
				EmitUShort( ColorTo565( minColor ) );
				EmitColorIndices_SSE2( block, minColor, maxColor );
			} else {
				EmitUShort( ColorTo565( minColor ) );
				EmitUShort( ColorTo565( maxColor ) );
				EmitColorAlphaIndices_SSE2( block, minColor, maxColor );
			}
		}
		outData += dstPadding;
		inBuf += srcPadding;
	}
#ifdef TEST_COMPRESSION
	int tmpDstPadding = dstPadding;
	dstPadding = 0;
	byte * testOutBuf = (byte *) _alloca16( width * height / 2 );
	CompressImageDXT1AlphaFast_Generic( inBuf, testOutBuf, width, height );
	for ( int j = 0; j < height/4; j++ ) {
		for ( int i = 0; i < width/4; i++ ) {
			byte * ptr1 = outBuf + ( j * width/4 + i ) * 8 + j * tmpDstPadding;
			byte * ptr2 = testOutBuf + ( j * width/4 + i ) * 8;
			for ( int k = 0; k < 8; k++ ) {
				assert( ptr1[k] == ptr2[k] );
			}
		}
	}
	dstPadding = tmpDstPadding;
#endif
}
/*
========================
idDxtEncoder::CompressImageDXT5Fast_SSE2
params:	inBuf	- image to compress
paramO:	outBuf	- result of compression
params:	width	- width of image
params:	height	- height of image
========================
*/
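// Note (not in the original): a DXT5 block is 16 bytes (an 8-byte alpha block
// with two endpoints plus 16 three-bit indices, followed by an 8-byte DXT1
// style color block), giving width * height bytes per image.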
void idDxtEncoder::CompressImageDXT5Fast_SSE2( const byte *inBuf, byte *outBuf, int width, int height ) {
	ALIGN16( byte block[64] );
	ALIGN16( byte minColor[4] );
	ALIGN16( byte maxColor[4] );
	assert( width >= 4 && ( width & 3 ) == 0 );
	assert( height >= 4 && ( height & 3 ) == 0 );
	this->width = width;
	this->height = height;
	this->outData = outBuf;
	for ( int j = 0; j < height; j += 4, inBuf += width * 4*4 ) {
		for ( int i = 0; i < width; i += 4 ) {
			ExtractBlock_SSE2( inBuf + i * 4, width, block );
			GetMinMaxBBox_SSE2( block, minColor, maxColor );
			InsetColorsBBox_SSE2( minColor, maxColor );
			EmitByte( maxColor[3] );
			EmitByte( minColor[3] );
			EmitAlphaIndices_SSE2( block, minColor[3], maxColor[3] );
			EmitUShort( ColorTo565( maxColor ) );
			EmitUShort( ColorTo565( minColor ) );
			EmitColorIndices_SSE2( block, minColor, maxColor );
		}
		outData += dstPadding;
		inBuf += srcPadding;
	}
#ifdef TEST_COMPRESSION
	int tmpDstPadding = dstPadding;
	dstPadding = 0;
	byte * testOutBuf = (byte *) _alloca16( width * height );
	CompressImageDXT5Fast_Generic( inBuf, testOutBuf, width, height );
	for ( int j = 0; j < height / 4; j++ ) {
		for ( int i = 0; i < width / 4; i++ ) {
			byte * ptr1 = outBuf + ( j * width/4 + i ) * 16 + j * tmpDstPadding;
			byte * ptr2 = testOutBuf + ( j * width/4 + i ) * 16;
			for ( int k = 0; k < 16; k++ ) {
				assert( ptr1[k] == ptr2[k] );
			}
		}
	}
	dstPadding = tmpDstPadding;
#endif
}
/*
========================
idDxtEncoder::ScaleYCoCg_SSE2
========================
*/
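// Illustrative note (not in the original): the code below appears to derive a
// Co/Cg scale factor from the largest chroma deviation from 128 (1 if it
// exceeds 63, 2 if it exceeds 31, else 4), apply it to the endpoints and the
// whole block, and encode the factor so the decoder can undo the scaling.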
ID_INLINE void idDxtEncoder::ScaleYCoCg_SSE2( byte *colorBlock, byte *minColor, byte *maxColor ) const {
#if ( defined( ID_WIN_X86_ASM ) || defined( ID_MAC_X86_ASM ) )
	__asm {
		mov esi, colorBlock
		mov edx, minColor
		mov ecx, maxColor
		movd xmm0, dword ptr [edx]
		movd xmm1, dword ptr [ecx]
		punpcklbw xmm0, SIMD_SSE2_byte_0
		punpcklbw xmm1, SIMD_SSE2_byte_0
		movdqa xmm6, SIMD_SSE2_word_center_128
		movdqa xmm7, SIMD_SSE2_word_center_128
		psubw xmm6, xmm0
		psubw xmm7, xmm1
		psubw xmm0, SIMD_SSE2_word_center_128
		psubw xmm1, SIMD_SSE2_word_center_128
		pmaxsw xmm6, xmm0
		pmaxsw xmm7, xmm1
		pmaxsw xmm6, xmm7
		pshuflw xmm7, xmm6, R_SHUFFLE_D( 1, 0, 1, 0 )
		pmaxsw xmm6, xmm7
		pshufd xmm6, xmm6, R_SHUFFLE_D( 0, 0, 0, 0 )
		movdqa xmm7, xmm6
		pcmpgtw xmm6, SIMD_SSE2_word_63 // mask0
		pcmpgtw xmm7, SIMD_SSE2_word_31 // mask1
		pandn xmm7, SIMD_SSE2_byte_2
		por xmm7, SIMD_SSE2_byte_1
		pandn xmm6, xmm7
		movdqa xmm3, xmm6
		movdqa xmm7, xmm6
		pxor xmm7, SIMD_SSE2_byte_not
		por xmm7, SIMD_SSE2_byte_scale_mask0 // 0xFF, 0xFF, 0x00, 0x00, 0xFF, 0xFF, 0x00, 0x00
		paddw xmm6, SIMD_SSE2_byte_1
		pand xmm6, SIMD_SSE2_byte_scale_mask1 // 0x00, 0x00, 0x00, 0xFF, 0x00, 0x00, 0x00, 0xFF
		por xmm6, SIMD_SSE2_byte_scale_mask2 // 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00
		movd xmm4, dword ptr [edx]
		movd xmm5, dword ptr [ecx]
		pand xmm4, SIMD_SSE2_byte_scale_mask3 // 0x00, 0x00, 0x00, 0x00, 0xFF, 0x00, 0xFF, 0xFF
		pand xmm5, SIMD_SSE2_byte_scale_mask3
		pslld xmm3, 3
		pand xmm3, SIMD_SSE2_byte_scale_mask4 // 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0x00, 0x00
		por xmm4, xmm3
		por xmm5, xmm3
		paddb xmm4, SIMD_SSE2_byte_minus_128_0
		paddb xmm5, SIMD_SSE2_byte_minus_128_0
		pmullw xmm4, xmm6
		pmullw xmm5, xmm6
		pand xmm4, xmm7
		pand xmm5, xmm7
		psubb xmm4, SIMD_SSE2_byte_minus_128_0
		psubb xmm5, SIMD_SSE2_byte_minus_128_0
		movd dword ptr [edx], xmm4
		movd dword ptr [ecx], xmm5
		movdqa xmm0, xmmword ptr [esi+ 0*4]
		movdqa xmm1, xmmword ptr [esi+ 4*4]
		movdqa xmm2, xmmword ptr [esi+ 8*4]
		movdqa xmm3, xmmword ptr [esi+12*4]
		paddb xmm0, SIMD_SSE2_byte_minus_128_0
		paddb xmm1, SIMD_SSE2_byte_minus_128_0
		paddb xmm2, SIMD_SSE2_byte_minus_128_0
		paddb xmm3, SIMD_SSE2_byte_minus_128_0
		pmullw xmm0, xmm6
		pmullw xmm1, xmm6
		pmullw xmm2, xmm6
		pmullw xmm3, xmm6
		pand xmm0, xmm7
		pand xmm1, xmm7
		pand xmm2, xmm7
		pand xmm3, xmm7
		psubb xmm0, SIMD_SSE2_byte_minus_128_0
		psubb xmm1, SIMD_SSE2_byte_minus_128_0
		psubb xmm2, SIMD_SSE2_byte_minus_128_0
		psubb xmm3, SIMD_SSE2_byte_minus_128_0
		movdqa xmmword ptr [esi+ 0*4], xmm0
		movdqa xmmword ptr [esi+ 4*4], xmm1
		movdqa xmmword ptr [esi+ 8*4], xmm2
		movdqa xmmword ptr [esi+12*4], xmm3
	}
#elif defined ( ID_WIN_X86_SSE2_INTRIN )
	__m128i block0 = *((__m128i *)(&colorBlock[ 0]));
	__m128i block1 = *((__m128i *)(&colorBlock[16]));
	__m128i block2 = *((__m128i *)(&colorBlock[32]));
	__m128i block3 = *((__m128i *)(&colorBlock[48]));
	__m128c temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
	temp0 = _mm_cvtsi32_si128( *(int *)minColor );
	temp1 = _mm_cvtsi32_si128( *(int *)maxColor );
	temp0 = _mm_unpacklo_epi8( temp0, (const __m128i &)SIMD_SSE2_byte_0 );
	temp1 = _mm_unpacklo_epi8( temp1, (const __m128i &)SIMD_SSE2_byte_0 );
	// TODO: the algorithm seems to compute the absolute difference from the center value 128
	temp6 = _mm_sub_epi16( (const __m128i &)SIMD_SSE2_word_center_128, temp0 );
	temp7 = _mm_sub_epi16( (const __m128i &)SIMD_SSE2_word_center_128, temp1 );
	temp0 = _mm_sub_epi16( temp0, (const __m128i &)SIMD_SSE2_word_center_128 );
	temp1 = _mm_sub_epi16( temp1, (const __m128i &)SIMD_SSE2_word_center_128 );
	temp6 = _mm_max_epi16( temp6, temp0 );
	temp7 = _mm_max_epi16( temp7, temp1 );
	temp6 = _mm_max_epi16( temp6, temp7 );
	temp7 = _mm_shufflelo_epi16( temp6, R_SHUFFLE_D( 1, 0, 1, 0 ) );
	temp6 = _mm_max_epi16( temp6, temp7 );
	temp6 = _mm_shuffle_epi32( temp6, R_SHUFFLE_D( 0, 0, 0, 0 ) );
	temp7 = temp6;
	temp6 = _mm_cmpgt_epi16( temp6, (const __m128i &)SIMD_SSE2_word_63 ); // mask0
	temp7 = _mm_cmpgt_epi16( temp7, (const __m128i &)SIMD_SSE2_word_31 ); // mask1
	temp7 = _mm_andnot_si128( temp7, (const __m128i &)SIMD_SSE2_byte_2 );
	temp7 = _mm_or_si128( temp7, (const __m128i &)SIMD_SSE2_byte_1 );
	temp6 = _mm_andnot_si128( temp6, temp7 );
	temp3 = temp6;
	temp7 = temp6;
	temp7 = _mm_xor_si128( temp7, (const __m128i &)SIMD_SSE2_byte_not );
	temp7 = _mm_or_si128( temp7, (const __m128i &)SIMD_SSE2_byte_scale_mask0 ); // 0xFF, 0xFF, 0x00, 0x00, 0xFF, 0xFF, 0x00, 0x00
	temp6 = _mm_add_epi16( temp6, (const __m128i &)SIMD_SSE2_byte_1 );
	temp6 = _mm_and_si128( temp6, (const __m128i &)SIMD_SSE2_byte_scale_mask1 ); // 0x00, 0x00, 0x00, 0xFF, 0x00, 0x00, 0x00, 0xFF
	temp6 = _mm_or_si128( temp6, (const __m128i &)SIMD_SSE2_byte_scale_mask2 ); // 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00
	// TODO: remove this second load of the endpoint colors
	temp4 = _mm_cvtsi32_si128( *(int *)minColor );
	temp5 = _mm_cvtsi32_si128( *(int *)maxColor );
	temp4 = _mm_and_si128( temp4, (const __m128i &)SIMD_SSE2_byte_scale_mask3 ); // 0x00, 0x00, 0x00, 0x00, 0xFF, 0x00, 0xFF, 0xFF
	temp5 = _mm_and_si128( temp5, (const __m128i &)SIMD_SSE2_byte_scale_mask3 );
	temp3 = _mm_slli_epi32( temp3, 3 );
	temp3 = _mm_and_si128( temp3, (const __m128i &)SIMD_SSE2_byte_scale_mask4 ); // 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0x00, 0x00
	temp4 = _mm_or_si128( temp4, temp3 );
	temp5 = _mm_or_si128( temp5, temp3 );
	temp4 = _mm_add_epi8( temp4, (const __m128i &)SIMD_SSE2_byte_minus_128_0 );
	temp5 = _mm_add_epi8( temp5, (const __m128i &)SIMD_SSE2_byte_minus_128_0 );
	temp4 = _mm_mullo_epi16( temp4, temp6 );
	temp5 = _mm_mullo_epi16( temp5, temp6 );
	temp4 = _mm_and_si128( temp4, temp7 );
	temp5 = _mm_and_si128( temp5, temp7 );
	temp4 = _mm_sub_epi8( temp4, (const __m128i &)SIMD_SSE2_byte_minus_128_0 );
	temp5 = _mm_sub_epi8( temp5, (const __m128i &)SIMD_SSE2_byte_minus_128_0 );
	*(int *)minColor = _mm_cvtsi128_si32( temp4 );
	*(int *)maxColor = _mm_cvtsi128_si32( temp5 );
	temp0 = _mm_add_epi8( block0, (const __m128i &)SIMD_SSE2_byte_minus_128_0 );
	temp1 = _mm_add_epi8( block1, (const __m128i &)SIMD_SSE2_byte_minus_128_0 );
	temp2 = _mm_add_epi8( block2, (const __m128i &)SIMD_SSE2_byte_minus_128_0 );
	temp3 = _mm_add_epi8( block3, (const __m128i &)SIMD_SSE2_byte_minus_128_0 );
	temp0 = _mm_mullo_epi16( temp0, temp6 );
	temp1 = _mm_mullo_epi16( temp1, temp6 );
	temp2 = _mm_mullo_epi16( temp2, temp6 );
	temp3 = _mm_mullo_epi16( temp3, temp6 );
	temp0 = _mm_and_si128( temp0, temp7 );
	temp1 = _mm_and_si128( temp1, temp7 );
	temp2 = _mm_and_si128( temp2, temp7 );
	temp3 = _mm_and_si128( temp3, temp7 );
	*((__m128i *)(&colorBlock[ 0])) = _mm_sub_epi8( temp0, (const __m128i &)SIMD_SSE2_byte_minus_128_0 );
	*((__m128i *)(&colorBlock[16])) = _mm_sub_epi8( temp1, (const __m128i &)SIMD_SSE2_byte_minus_128_0 );
	*((__m128i *)(&colorBlock[32])) = _mm_sub_epi8( temp2, (const __m128i &)SIMD_SSE2_byte_minus_128_0 );
	*((__m128i *)(&colorBlock[48])) = _mm_sub_epi8( temp3, (const __m128i &)SIMD_SSE2_byte_minus_128_0 );
#else
	assert( false );
#endif
}
/*
========================
idDxtEncoder::InsetYCoCgBBox_SSE2
========================
*/
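// Illustrative note (not in the original): the inset below shrinks the color
// box by a masked fraction of its extent in fixed point (scale up, apply the
// inset, multiply-high back down), clamps at zero, then quantizes with the
// quant mask and replicates the top bits into the low bits.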
ID_INLINE void idDxtEncoder::InsetYCoCgBBox_SSE2( byte *minColor, byte *maxColor ) const {
#if ( defined( ID_WIN_X86_ASM ) || defined( ID_MAC_X86_ASM ) )
	__asm {
		mov esi, minColor
		mov edi, maxColor
		movd xmm0, dword ptr [esi]
		movd xmm1, dword ptr [edi]
		punpcklbw xmm0, SIMD_SSE2_byte_0
		punpcklbw xmm1, SIMD_SSE2_byte_0
		movdqa xmm2, xmm1
		psubw xmm2, xmm0
		psubw xmm2, SIMD_SSE2_word_insetYCoCgRound
		pand xmm2, SIMD_SSE2_word_insetYCoCgMask
		pmullw xmm0, SIMD_SSE2_word_insetYCoCgShiftUp
		pmullw xmm1, SIMD_SSE2_word_insetYCoCgShiftUp
		paddw xmm0, xmm2
		psubw xmm1, xmm2
		pmulhw xmm0, SIMD_SSE2_word_insetYCoCgShiftDown
		pmulhw xmm1, SIMD_SSE2_word_insetYCoCgShiftDown
		pmaxsw xmm0, SIMD_SSE2_word_0
		pmaxsw xmm1, SIMD_SSE2_word_0
		pand xmm0, SIMD_SSE2_word_insetYCoCgQuantMask
		pand xmm1, SIMD_SSE2_word_insetYCoCgQuantMask
		movdqa xmm2, xmm0
		movdqa xmm3, xmm1
		pmulhw xmm2, SIMD_SSE2_word_insetYCoCgRep
		pmulhw xmm3, SIMD_SSE2_word_insetYCoCgRep
		por xmm0, xmm2
		por xmm1, xmm3
		packuswb xmm0, xmm0
		packuswb xmm1, xmm1
		movd dword ptr [esi], xmm0
		movd dword ptr [edi], xmm1
	}
#elif defined ( ID_WIN_X86_SSE2_INTRIN )
	__m128c temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
	temp0 = _mm_cvtsi32_si128( *(int *)minColor );
	temp1 = _mm_cvtsi32_si128( *(int *)maxColor );
	temp0 = _mm_unpacklo_epi8( temp0, (const __m128i &)SIMD_SSE2_byte_0 );
	temp1 = _mm_unpacklo_epi8( temp1, (const __m128i &)SIMD_SSE2_byte_0 );
	temp2 = _mm_sub_epi16( temp1, temp0 );
	temp2 = _mm_sub_epi16( temp2, (const __m128i &)SIMD_SSE2_word_insetYCoCgRound );
	temp2 = _mm_and_si128( temp2, (const __m128i &)SIMD_SSE2_word_insetYCoCgMask );
	temp0 = _mm_mullo_epi16( temp0, (const __m128i &)SIMD_SSE2_word_insetYCoCgShiftUp );
	temp1 = _mm_mullo_epi16( temp1, (const __m128i &)SIMD_SSE2_word_insetYCoCgShiftUp );
	temp0 = _mm_add_epi16( temp0, temp2 );
	temp1 = _mm_sub_epi16( temp1, temp2 );
	temp0 = _mm_mulhi_epi16( temp0, (const __m128i &)SIMD_SSE2_word_insetYCoCgShiftDown );
	temp1 = _mm_mulhi_epi16( temp1, (const __m128i &)SIMD_SSE2_word_insetYCoCgShiftDown );
	temp0 = _mm_max_epi16( temp0, (const __m128i &)SIMD_SSE2_word_0 );
	temp1 = _mm_max_epi16( temp1, (const __m128i &)SIMD_SSE2_word_0 );
	temp0 = _mm_and_si128( temp0, (const __m128i &)SIMD_SSE2_word_insetYCoCgQuantMask );
	temp1 = _mm_and_si128( temp1, (const __m128i &)SIMD_SSE2_word_insetYCoCgQuantMask );
	temp2 = _mm_mulhi_epi16( temp0, (const __m128i &)SIMD_SSE2_word_insetYCoCgRep );
	temp3 = _mm_mulhi_epi16( temp1, (const __m128i &)SIMD_SSE2_word_insetYCoCgRep );
	temp0 = _mm_or_si128( temp0, temp2 );
	temp1 = _mm_or_si128( temp1, temp3 );
	temp0 = _mm_packus_epi16( temp0, temp0 );
	temp1 = _mm_packus_epi16( temp1, temp1 );
	*(int *)minColor = _mm_cvtsi128_si32( temp0 );
	*(int *)maxColor = _mm_cvtsi128_si32( temp1 );
#else
	assert( false );
#endif
}
/*
========================
idDxtEncoder::SelectYCoCgDiagonal_SSE2
params:	colorBlock	- 16 pixel block to select the CoCg diagonal for
paramO:	minColor	- min color found, updated in place for the chosen diagonal
paramO:	maxColor	- max color found, updated in place for the chosen diagonal
========================
*/
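// Illustrative note (not in the original): each pixel's Co and Cg are compared
// against the midpoint of the bounding box; where the two comparisons disagree
// the pixel lies off the min-to-max diagonal. If more than half of the 16
// pixels do ( side > 8 ), the Cg endpoints are swapped to use the other diagonal.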
ID_INLINE void idDxtEncoder::SelectYCoCgDiagonal_SSE2( const byte *colorBlock, byte *minColor, byte *maxColor ) const {
#if ( defined( ID_WIN_X86_ASM ) || defined( ID_MAC_X86_ASM ) )
	__asm {
		mov esi, colorBlock
		mov edx, minColor
		mov ecx, maxColor
		movdqa xmm0, xmmword ptr [esi+ 0]
		movdqa xmm1, xmmword ptr [esi+16]
		movdqa xmm2, xmmword ptr [esi+32]
		movdqa xmm3, xmmword ptr [esi+48]
		pand xmm0, SIMD_SSE2_dword_word_mask
		pand xmm1, SIMD_SSE2_dword_word_mask
		pand xmm2, SIMD_SSE2_dword_word_mask
		pand xmm3, SIMD_SSE2_dword_word_mask
		pslldq xmm1, 2
		pslldq xmm3, 2
		por xmm0, xmm1
		por xmm2, xmm3
		movd xmm1, dword ptr [edx] // minColor
		movd xmm3, dword ptr [ecx] // maxColor
		movdqa xmm6, xmm1
		movdqa xmm7, xmm3
		pavgb xmm1, xmm3
		pshuflw xmm1, xmm1, R_SHUFFLE_D( 0, 0, 0, 0 )
		pshufd xmm1, xmm1, R_SHUFFLE_D( 0, 0, 0, 0 )
		movdqa xmm3, xmm1
		pmaxub xmm1, xmm0
		pmaxub xmm3, xmm2
		pcmpeqb xmm1, xmm0
		pcmpeqb xmm3, xmm2
		movdqa xmm0, xmm1
		movdqa xmm2, xmm3
		psrldq xmm0, 1
		psrldq xmm2, 1
		pxor xmm0, xmm1
		pxor xmm2, xmm3
		pand xmm0, SIMD_SSE2_word_1
		pand xmm2, SIMD_SSE2_word_1
		paddw xmm0, xmm2
		psadbw xmm0, SIMD_SSE2_byte_0
		pshufd xmm1, xmm0, R_SHUFFLE_D( 2, 3, 0, 1 )
#ifdef NVIDIA_7X_HARDWARE_BUG_FIX
		paddw xmm1, xmm0 // side
		pcmpgtw xmm1, SIMD_SSE2_word_8 // mask = -( side > 8 )
		pand xmm1, SIMD_SSE2_byte_diagonalMask
		movdqa xmm0, xmm6
		pcmpeqb xmm0, xmm7 // mask &= -( minColor[0] != maxColor[0] )
		pslldq xmm0, 1
		pandn xmm0, xmm1
#else
		paddw xmm0, xmm1 // side
		pcmpgtw xmm0, SIMD_SSE2_word_8 // mask = -( side > 8 )
		pand xmm0, SIMD_SSE2_byte_diagonalMask
#endif
		pxor xmm6, xmm7
		pand xmm0, xmm6
		pxor xmm7, xmm0
		pxor xmm6, xmm7
		movd dword ptr [edx], xmm6
		movd dword ptr [ecx], xmm7
	}
#elif defined ( ID_WIN_X86_SSE2_INTRIN )
	__m128i block0 = *((__m128i *)(&colorBlock[ 0]));
	__m128i block1 = *((__m128i *)(&colorBlock[16]));
	__m128i block2 = *((__m128i *)(&colorBlock[32]));
	__m128i block3 = *((__m128i *)(&colorBlock[48]));
	__m128c temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
	temp0 = _mm_and_si128( block0, (const __m128i &)SIMD_SSE2_dword_word_mask );
	temp1 = _mm_and_si128( block1, (const __m128i &)SIMD_SSE2_dword_word_mask );
	temp2 = _mm_and_si128( block2, (const __m128i &)SIMD_SSE2_dword_word_mask );
	temp3 = _mm_and_si128( block3, (const __m128i &)SIMD_SSE2_dword_word_mask );
	temp1 = _mm_slli_si128( temp1, 2 );
	temp3 = _mm_slli_si128( temp3, 2 );
	temp0 = _mm_or_si128( temp0, temp1 );
	temp2 = _mm_or_si128( temp2, temp3 );
	temp6 = _mm_cvtsi32_si128( *(int *)minColor );
	temp7 = _mm_cvtsi32_si128( *(int *)maxColor );
	temp1 = _mm_avg_epu8( temp6, temp7 );
	temp1 = _mm_shufflelo_epi16( temp1, R_SHUFFLE_D( 0, 0, 0, 0 ) );
	temp1 = _mm_shuffle_epi32( temp1, R_SHUFFLE_D( 0, 0, 0, 0 ) );
	temp3 = _mm_max_epu8( temp1, temp2 );
	temp1 = _mm_max_epu8( temp1, temp0 );
	temp1 = _mm_cmpeq_epi8( temp1, temp0 );
	temp3 = _mm_cmpeq_epi8( temp3, temp2 );
	temp0 = _mm_srli_si128( temp1, 1 );
	temp2 = _mm_srli_si128( temp3, 1 );
	temp0 = _mm_xor_si128( temp0, temp1 );
	temp2 = _mm_xor_si128( temp2, temp3 );
	temp0 = _mm_and_si128( temp0, (const __m128i &)SIMD_SSE2_word_1 );
	temp2 = _mm_and_si128( temp2, (const __m128i &)SIMD_SSE2_word_1 );
	temp0 = _mm_add_epi16( temp0, temp2 );
	temp0 = _mm_sad_epu8( temp0, (const __m128i &)SIMD_SSE2_byte_0 );
	temp1 = _mm_shuffle_epi32( temp0, R_SHUFFLE_D( 2, 3, 0, 1 ) );
#ifdef NVIDIA_7X_HARDWARE_BUG_FIX
	temp1 = _mm_add_epi16( temp1, temp0 );
	temp1 = _mm_cmpgt_epi16( temp1, (const __m128i &)SIMD_SSE2_word_8 );
	temp1 = _mm_and_si128( temp1, (const __m128i &)SIMD_SSE2_byte_diagonalMask );
	temp0 = _mm_cmpeq_epi8( temp6, temp7 );
	temp0 = _mm_slli_si128( temp0, 1 );
	temp0 = _mm_andnot_si128( temp0, temp1 );
#else
	temp0 = _mm_add_epi16( temp0, temp1 );
	temp0 = _mm_cmpgt_epi16( temp0, (const __m128i &)SIMD_SSE2_word_8 );
	temp0 = _mm_and_si128( temp0, (const __m128i &)SIMD_SSE2_byte_diagonalMask );
#endif
	temp6 = _mm_xor_si128( temp6, temp7 );
	temp0 = _mm_and_si128( temp0, temp6 );
	temp7 = _mm_xor_si128( temp7, temp0 );
	temp6 = _mm_xor_si128( temp6, temp7 );
	*(int *)minColor = _mm_cvtsi128_si32( temp6 );
	*(int *)maxColor = _mm_cvtsi128_si32( temp7 );
#else
	assert( false );
#endif
}
/*
========================
idDxtEncoder::CompressYCoCgDXT5Fast_SSE2
params:	inBuf	- image to compress
paramO:	outBuf	- result of compression
params:	width	- width of image
params:	height	- height of image
========================
*/
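// Assumed layout note (not in the original): this YCoCg-DXT5 path expects Co
// in the red channel, Cg in green, the scale selector in blue and luma Y in
// alpha, so the alpha block carries Y and the color block carries the chroma.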
void idDxtEncoder::CompressYCoCgDXT5Fast_SSE2( const byte *inBuf, byte *outBuf, int width, int height ) {
	ALIGN16( byte block[64] );
	ALIGN16( byte minColor[4] );
	ALIGN16( byte maxColor[4] );
	//assert( HasConstantValuePer4x4Block( inBuf, width, height, 2 ) );
	assert( width >= 4 && ( width & 3 ) == 0 );
	assert( height >= 4 && ( height & 3 ) == 0 );
	this->width = width;
	this->height = height;
	this->outData = outBuf;
	for ( int j = 0; j < height; j += 4, inBuf += width * 4*4 ) {
		for ( int i = 0; i < width; i += 4 ) {
			ExtractBlock_SSE2( inBuf + i * 4, width, block );
			GetMinMaxBBox_SSE2( block, minColor, maxColor );
			ScaleYCoCg_SSE2( block, minColor, maxColor );
			InsetYCoCgBBox_SSE2( minColor, maxColor );
			SelectYCoCgDiagonal_SSE2( block, minColor, maxColor );
			EmitByte( maxColor[3] );
			EmitByte( minColor[3] );
			EmitAlphaIndices_SSE2( block, minColor[3], maxColor[3] );
			EmitUShort( ColorTo565( maxColor ) );
			EmitUShort( ColorTo565( minColor ) );
			EmitCoCgIndices_SSE2( block, minColor, maxColor );
		}
		outData += dstPadding;
		inBuf += srcPadding;
	}
#ifdef TEST_COMPRESSION
	int tmpDstPadding = dstPadding;
	dstPadding = 0;
	byte * testOutBuf = (byte *) _alloca16( width * height );
	CompressYCoCgDXT5Fast_Generic( inBuf, testOutBuf, width, height );
	for ( int j = 0; j < height / 4; j++ ) {
		for ( int i = 0; i < width / 4; i++ ) {
			byte * ptr1 = outBuf + ( j * width/4 + i ) * 16 + j * tmpDstPadding;
			byte * ptr2 = testOutBuf + ( j * width/4 + i ) * 16;
			for ( int k = 0; k < 16; k++ ) {
				assert( ptr1[k] == ptr2[k] );
			}
		}
	}
	dstPadding = tmpDstPadding;
#endif
}
/*
========================
idDxtEncoder::EmitGreenIndices_SSE2
params:	block		- 16-normal block for which to find normal Y indices
paramO:	minGreen	- Minimal normal Y found
paramO:	maxGreen	- Maximal normal Y found
========================
*/
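// Illustrative scalar sketch (not part of the original encoder): the scale
// vectors below build the three thresholds between the four green palette
// entries maxGreen, ( 2*maxGreen + minGreen ) / 3, ( maxGreen + 2*minGreen ) / 3
// and minGreen:
//
//	t0 = ( 5 * maxGreen + 1 * minGreen + 3 ) / 6;	// between entries 0 and 2
//	t1 = ( 3 * maxGreen + 3 * minGreen + 3 ) / 6;	// between entries 2 and 3
//	t2 = ( 1 * maxGreen + 5 * minGreen + 3 ) / 6;	// between entries 3 and 1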
void idDxtEncoder::EmitGreenIndices_SSE2( const byte *block, const int channelBitOffset, const int minGreen, const int maxGreen ) {
#if ( defined( ID_WIN_X86_ASM ) || defined( ID_MAC_X86_ASM ) )
	assert( maxGreen >= minGreen );
	byte *outPtr = outData;
	__asm {
		movd xmm7, channelBitOffset
		mov esi, block
		movdqa xmm0, xmmword ptr [esi+ 0]
		movdqa xmm5, xmmword ptr [esi+16]
		movdqa xmm6, xmmword ptr [esi+32]
		movdqa xmm4, xmmword ptr [esi+48]
		psrld xmm0, xmm7
		psrld xmm5, xmm7
		psrld xmm6, xmm7
		psrld xmm4, xmm7
		pand xmm0, SIMD_SSE2_dword_byte_mask
		pand xmm5, SIMD_SSE2_dword_byte_mask
		pand xmm6, SIMD_SSE2_dword_byte_mask
		pand xmm4, SIMD_SSE2_dword_byte_mask
		packuswb xmm0, xmm5
		packuswb xmm6, xmm4
		//---------------------
		movd xmm2, maxGreen
		pshuflw xmm2, xmm2, R_SHUFFLE_D( 0, 0, 0, 0 )
		movd xmm3, minGreen
		pshuflw xmm3, xmm3, R_SHUFFLE_D( 0, 0, 0, 0 )
		pmullw xmm2, SIMD_SSE2_word_scale_5_3_1
		pmullw xmm3, SIMD_SSE2_word_scale_1_3_5
		paddw xmm2, SIMD_SSE2_word_3
		paddw xmm3, xmm2
		pmulhw xmm3, SIMD_SSE2_word_div_by_6
		pshuflw xmm1, xmm3, R_SHUFFLE_D( 0, 0, 0, 0 )
		pshuflw xmm2, xmm3, R_SHUFFLE_D( 1, 1, 1, 1 )
		pshuflw xmm3, xmm3, R_SHUFFLE_D( 2, 2, 2, 2 )
		pshufd xmm1, xmm1, R_SHUFFLE_D( 0, 0, 0, 0 )
		pshufd xmm2, xmm2, R_SHUFFLE_D( 0, 0, 0, 0 )
		pshufd xmm3, xmm3, R_SHUFFLE_D( 0, 0, 0, 0 )
		packuswb xmm1, xmm1
		packuswb xmm2, xmm2
		packuswb xmm3, xmm3
		packuswb xmm0, xmm6
		pmaxub xmm1, xmm0
		pmaxub xmm2, xmm0
		pmaxub xmm3, xmm0
		pcmpeqb xmm1, xmm0
		pcmpeqb xmm2, xmm0
		pcmpeqb xmm3, xmm0
		movdqa xmm0, SIMD_SSE2_byte_4
		paddsb xmm0, xmm1
		paddsb xmm2, xmm3
		paddsb xmm0, xmm2
		pand xmm0, SIMD_SSE2_byte_3
		movdqa xmm4, SIMD_SSE2_byte_2
		pcmpgtb xmm4, xmm0
		pand xmm4, SIMD_SSE2_byte_1
		pxor xmm0, xmm4
		movdqa xmm4, xmm0
		movdqa xmm5, xmm0
		movdqa xmm6, xmm0
		movdqa xmm7, xmm0
		psrlq xmm4, 8 - 2
		psrlq xmm5, 16 - 4
		psrlq xmm6, 24 - 6
		psrlq xmm7, 32 - 8
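		// Byte k of xmm0 holds a 2-bit index; shifting copy k right by
		// 8*k - 2*k bits packs the indices 2 bits apart before masking.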
  2117. pand xmm4, SIMD_SSE2_dword_color_bit_mask1
  2118. pand xmm5, SIMD_SSE2_dword_color_bit_mask2
  2119. pand xmm6, SIMD_SSE2_dword_color_bit_mask3
  2120. pand xmm7, SIMD_SSE2_dword_color_bit_mask4
  2121. por xmm5, xmm4
  2122. por xmm7, xmm6
  2123. por xmm7, xmm5
  2124. movdqa xmm4, xmm0
  2125. movdqa xmm5, xmm0
  2126. movdqa xmm6, xmm0
  2127. psrlq xmm4, 40-10
  2128. psrlq xmm5, 48-12
  2129. psrlq xmm6, 56-14
  2130. pand xmm0, SIMD_SSE2_dword_color_bit_mask0
  2131. pand xmm4, SIMD_SSE2_dword_color_bit_mask5
  2132. pand xmm5, SIMD_SSE2_dword_color_bit_mask6
  2133. pand xmm6, SIMD_SSE2_dword_color_bit_mask7
  2134. por xmm4, xmm5
  2135. por xmm0, xmm6
  2136. por xmm7, xmm4
  2137. por xmm7, xmm0
  2138. mov esi, outPtr
  2139. pshufd xmm7, xmm7, R_SHUFFLE_D( 0, 2, 1, 3 )
  2140. pshuflw xmm7, xmm7, R_SHUFFLE_D( 0, 2, 1, 3 )
  2141. movd [esi], xmm7
  2142. }
  2143. outData += 4;
  2144. #elif defined ( ID_WIN_X86_SSE2_INTRIN )
  2145. __m128i block0 = *((__m128i *)(&block[ 0]));
  2146. __m128i block1 = *((__m128i *)(&block[16]));
  2147. __m128i block2 = *((__m128i *)(&block[32]));
  2148. __m128i block3 = *((__m128i *)(&block[48]));
  2149. __m128c temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
  2150. temp7 = _mm_cvtsi32_si128( channelBitOffset );
  2151. temp0 = _mm_srl_epi32( block0, temp7 );
  2152. temp5 = _mm_srl_epi32( block1, temp7 );
  2153. temp6 = _mm_srl_epi32( block2, temp7 );
  2154. temp4 = _mm_srl_epi32( block3, temp7 );
  2155. temp0 = _mm_and_si128( temp0, (const __m128i &)SIMD_SSE2_dword_byte_mask );
  2156. temp5 = _mm_and_si128( temp5, (const __m128i &)SIMD_SSE2_dword_byte_mask );
  2157. temp6 = _mm_and_si128( temp6, (const __m128i &)SIMD_SSE2_dword_byte_mask );
  2158. temp4 = _mm_and_si128( temp4, (const __m128i &)SIMD_SSE2_dword_byte_mask );
  2159. temp0 = _mm_packus_epi16( temp0, temp5 );
  2160. temp6 = _mm_packus_epi16( temp6, temp4 );
  2161. //---------------------
  2162. temp2 = _mm_cvtsi32_si128( maxGreen );
  2163. temp2 = _mm_shufflelo_epi16( temp2, R_SHUFFLE_D( 0, 0, 0, 0 ) );
  2164. temp3 = _mm_cvtsi32_si128( minGreen );
  2165. temp3 = _mm_shufflelo_epi16( temp3, R_SHUFFLE_D( 0, 0, 0, 0 ) );
  2166. temp2 = _mm_mullo_epi16( temp2, (const __m128i &)SIMD_SSE2_word_scale_5_3_1 );
  2167. temp3 = _mm_mullo_epi16( temp3, (const __m128i &)SIMD_SSE2_word_scale_1_3_5 );
  2168. temp2 = _mm_add_epi16( temp2, (const __m128i &)SIMD_SSE2_word_3 );
  2169. temp3 = _mm_add_epi16( temp3, temp2 );
  2170. temp3 = _mm_mulhi_epi16( temp3, (const __m128i &)SIMD_SSE2_word_div_by_6 );
  2171. temp1 = _mm_shufflelo_epi16( temp3, R_SHUFFLE_D( 0, 0, 0, 0 ) );
  2172. temp2 = _mm_shufflelo_epi16( temp3, R_SHUFFLE_D( 1, 1, 1, 1 ) );
  2173. temp3 = _mm_shufflelo_epi16( temp3, R_SHUFFLE_D( 2, 2, 2, 2 ) );
  2174. temp1 = _mm_shuffle_epi32( temp1, R_SHUFFLE_D( 0, 0, 0, 0 ) );
  2175. temp2 = _mm_shuffle_epi32( temp2, R_SHUFFLE_D( 0, 0, 0, 0 ) );
  2176. temp3 = _mm_shuffle_epi32( temp3, R_SHUFFLE_D( 0, 0, 0, 0 ) );
  2177. temp1 = _mm_packus_epi16( temp1, temp1 );
  2178. temp2 = _mm_packus_epi16( temp2, temp2 );
  2179. temp3 = _mm_packus_epi16( temp3, temp3 );
  2180. temp0 = _mm_packus_epi16( temp0, temp6 );
  2181. temp1 = _mm_max_epu8( temp1, temp0 );
  2182. temp2 = _mm_max_epu8( temp2, temp0 );
  2183. temp3 = _mm_max_epu8( temp3, temp0 );
  2184. temp1 = _mm_cmpeq_epi8( temp1, temp0 );
  2185. temp2 = _mm_cmpeq_epi8( temp2, temp0 );
  2186. temp3 = _mm_cmpeq_epi8( temp3, temp0 );
  2187. temp0 = (const __m128i &)SIMD_SSE2_byte_4;
  2188. temp0 = _mm_adds_epi8( temp0, temp1 );
  2189. temp2 = _mm_adds_epi8( temp2, temp3 );
  2190. temp0 = _mm_adds_epi8( temp0, temp2 );
  2191. temp0 = _mm_and_si128( temp0, (const __m128i &)SIMD_SSE2_byte_3 );
  2192. temp4 = (const __m128i &)SIMD_SSE2_byte_2;
  2193. temp4 = _mm_cmpgt_epi8( temp4, temp0 );
  2194. temp4 = _mm_and_si128( temp4, (const __m128i &)SIMD_SSE2_byte_1 );
  2195. temp0 = _mm_xor_si128( temp0, temp4 );
	temp4 = _mm_srli_epi64( temp0, 8 - 2 );
	temp5 = _mm_srli_epi64( temp0, 16 - 4 );
	temp6 = _mm_srli_epi64( temp0, 24 - 6 );
	temp7 = _mm_srli_epi64( temp0, 32 - 8 );
	temp4 = _mm_and_si128( temp4, (const __m128i &)SIMD_SSE2_dword_color_bit_mask1 );
	temp5 = _mm_and_si128( temp5, (const __m128i &)SIMD_SSE2_dword_color_bit_mask2 );
	temp6 = _mm_and_si128( temp6, (const __m128i &)SIMD_SSE2_dword_color_bit_mask3 );
	temp7 = _mm_and_si128( temp7, (const __m128i &)SIMD_SSE2_dword_color_bit_mask4 );
	temp5 = _mm_or_si128( temp5, temp4 );
	temp7 = _mm_or_si128( temp7, temp6 );
	temp7 = _mm_or_si128( temp7, temp5 );
	temp4 = _mm_srli_epi64( temp0, 40 - 10 );
	temp5 = _mm_srli_epi64( temp0, 48 - 12 );
	temp6 = _mm_srli_epi64( temp0, 56 - 14 );
	temp0 = _mm_and_si128( temp0, (const __m128i &)SIMD_SSE2_dword_color_bit_mask0 );
	temp4 = _mm_and_si128( temp4, (const __m128i &)SIMD_SSE2_dword_color_bit_mask5 );
	temp5 = _mm_and_si128( temp5, (const __m128i &)SIMD_SSE2_dword_color_bit_mask6 );
	temp6 = _mm_and_si128( temp6, (const __m128i &)SIMD_SSE2_dword_color_bit_mask7 );
	temp4 = _mm_or_si128( temp4, temp5 );
	temp0 = _mm_or_si128( temp0, temp6 );
	temp7 = _mm_or_si128( temp7, temp4 );
	temp7 = _mm_or_si128( temp7, temp0 );
	temp7 = _mm_shuffle_epi32( temp7, R_SHUFFLE_D( 0, 2, 1, 3 ) );
	temp7 = _mm_shufflelo_epi16( temp7, R_SHUFFLE_D( 0, 2, 1, 3 ) );
	int result = _mm_cvtsi128_si32( temp7 );
	EmitUInt( result );
#else
	assert( false );
#endif
}
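
/*
	Reference sketch (not compiled): a minimal scalar equivalent of the green
	index selection above, derived from the SIMD constants. Assumes the DXT1
	index order 0 = maxGreen, 1 = minGreen, 2 = 2/3 max + 1/3 min,
	3 = 1/3 max + 2/3 min; illustration only, the shipping path is the SIMD code.
*/
#if 0
unsigned int EmitGreenIndices_Sketch( const byte *block, const int channelBitOffset, const int minGreen, const int maxGreen ) {
	// midpoints between adjacent palette entries ( word_scale_5_3_1 / word_scale_1_3_5 / word_div_by_6 )
	const int mid0 = ( 5 * maxGreen + 1 * minGreen + 3 ) / 6;	// between palette entries 0 and 2
	const int mid1 = ( 3 * maxGreen + 3 * minGreen + 3 ) / 6;	// between palette entries 2 and 3
	const int mid2 = ( 1 * maxGreen + 5 * minGreen + 3 ) / 6;	// between palette entries 3 and 1
	unsigned int result = 0;
	for ( int i = 0; i < 16; i++ ) {
		const int green = ( *(const unsigned int *)&block[i * 4] >> channelBitOffset ) & 0xFF;
		int index = ( 4 - ( green >= mid0 ) - ( green >= mid1 ) - ( green >= mid2 ) ) & 3;
		if ( index < 2 ) {
			index ^= 1;		// swap 0 <-> 1 so 0 selects maxGreen and 1 selects minGreen
		}
		result |= index << ( i * 2 );	// texel 0 lands in the lowest two bits
	}
	return result;
}
#endif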

/*
========================
idDxtEncoder::InsetNormalsBBoxDXT5_SSE2
========================
*/
void idDxtEncoder::InsetNormalsBBoxDXT5_SSE2( byte *minNormal, byte *maxNormal ) const {
#if ( defined( ID_WIN_X86_ASM ) || defined( ID_MAC_X86_ASM ) )
	__asm {
		mov			esi, minNormal
		mov			edi, maxNormal
		movd		xmm0, dword ptr [esi]		// xmm0 = minNormal
		movd		xmm1, dword ptr [edi]		// xmm1 = maxNormal
		punpcklbw	xmm0, SIMD_SSE2_byte_0
		punpcklbw	xmm1, SIMD_SSE2_byte_0
		movdqa		xmm2, xmm1
		psubw		xmm2, xmm0
		psubw		xmm2, SIMD_SSE2_word_insetNormalDXT5Round
		pand		xmm2, SIMD_SSE2_word_insetNormalDXT5Mask		// xmm2 = inset (1 & 3)
		pmullw		xmm0, SIMD_SSE2_word_insetNormalDXT5ShiftUp
		pmullw		xmm1, SIMD_SSE2_word_insetNormalDXT5ShiftUp
		paddw		xmm0, xmm2
		psubw		xmm1, xmm2
		pmulhw		xmm0, SIMD_SSE2_word_insetNormalDXT5ShiftDown	// xmm0 = mini
		pmulhw		xmm1, SIMD_SSE2_word_insetNormalDXT5ShiftDown	// xmm1 = maxi
		// mini and maxi must be >= 0 and <= 255
		pmaxsw		xmm0, SIMD_SSE2_word_0
		pmaxsw		xmm1, SIMD_SSE2_word_0
		pminsw		xmm0, SIMD_SSE2_word_255
		pminsw		xmm1, SIMD_SSE2_word_255
		movdqa		xmm2, xmm0
		movdqa		xmm3, xmm1
		pand		xmm0, SIMD_SSE2_word_insetNormalDXT5QuantMask
		pand		xmm1, SIMD_SSE2_word_insetNormalDXT5QuantMask
		pmulhw		xmm2, SIMD_SSE2_word_insetNormalDXT5Rep
		pmulhw		xmm3, SIMD_SSE2_word_insetNormalDXT5Rep
		por			xmm0, xmm2
		por			xmm1, xmm3
		packuswb	xmm0, xmm0
		packuswb	xmm1, xmm1
		movd		dword ptr [esi], xmm0
		movd		dword ptr [edi], xmm1
	}
#elif defined ( ID_WIN_X86_SSE2_INTRIN )
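	// Same sequence as the asm path above: widen min/max to 16-bit words,
	// compute the inset, scale, clamp to [0,255], then quantize and replicate.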
	__m128i temp0, temp1, temp2, temp3;
	temp0 = _mm_cvtsi32_si128( *(int *)minNormal );
	temp1 = _mm_cvtsi32_si128( *(int *)maxNormal );
	temp0 = _mm_unpacklo_epi8( temp0, (const __m128i &)SIMD_SSE2_byte_0 );
	temp1 = _mm_unpacklo_epi8( temp1, (const __m128i &)SIMD_SSE2_byte_0 );
	temp2 = _mm_sub_epi16( temp1, temp0 );
	temp2 = _mm_sub_epi16( temp2, (const __m128i &)SIMD_SSE2_word_insetNormalDXT5Round );
	temp2 = _mm_and_si128( temp2, (const __m128i &)SIMD_SSE2_word_insetNormalDXT5Mask );	// xmm2 = inset (1 & 3)
	temp0 = _mm_mullo_epi16( temp0, (const __m128i &)SIMD_SSE2_word_insetNormalDXT5ShiftUp );
	temp1 = _mm_mullo_epi16( temp1, (const __m128i &)SIMD_SSE2_word_insetNormalDXT5ShiftUp );
	temp0 = _mm_add_epi16( temp0, temp2 );
	temp1 = _mm_sub_epi16( temp1, temp2 );
	temp0 = _mm_mulhi_epi16( temp0, (const __m128i &)SIMD_SSE2_word_insetNormalDXT5ShiftDown );	// xmm0 = mini
	temp1 = _mm_mulhi_epi16( temp1, (const __m128i &)SIMD_SSE2_word_insetNormalDXT5ShiftDown );	// xmm1 = maxi
	// mini and maxi must be >= 0 and <= 255
	temp0 = _mm_max_epi16( temp0, (const __m128i &)SIMD_SSE2_word_0 );
	temp1 = _mm_max_epi16( temp1, (const __m128i &)SIMD_SSE2_word_0 );
	temp0 = _mm_min_epi16( temp0, (const __m128i &)SIMD_SSE2_word_255 );
	temp1 = _mm_min_epi16( temp1, (const __m128i &)SIMD_SSE2_word_255 );
	temp0 = _mm_and_si128( temp0, (const __m128i &)SIMD_SSE2_word_insetNormalDXT5QuantMask );
	temp1 = _mm_and_si128( temp1, (const __m128i &)SIMD_SSE2_word_insetNormalDXT5QuantMask );
	temp2 = _mm_mulhi_epi16( temp0, (const __m128i &)SIMD_SSE2_word_insetNormalDXT5Rep );
	temp3 = _mm_mulhi_epi16( temp1, (const __m128i &)SIMD_SSE2_word_insetNormalDXT5Rep );
	temp0 = _mm_or_si128( temp0, temp2 );
	temp1 = _mm_or_si128( temp1, temp3 );
	temp0 = _mm_packus_epi16( temp0, temp0 );
	temp1 = _mm_packus_epi16( temp1, temp1 );
	*(int *)minNormal = _mm_cvtsi128_si32( temp0 );
	*(int *)maxNormal = _mm_cvtsi128_si32( temp1 );
#else
	assert( false );
#endif
}
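
/*
	Reference sketch (not compiled): the scalar shape of the inset above for one
	min/max channel pair, assuming each SIMD_SSE2_word_insetNormalDXT5* constant
	carries per-word values round, mask, shiftUp, shiftDown (a 16.16 fixed-point
	reciprocal), quantMask and rep. Illustration only.
*/
#if 0
void InsetNormalChannel_Sketch( int & mini, int & maxi,
		int round, int mask, int shiftUp, int shiftDown, int quantMask, int rep ) {
	// move both ends of the range toward the middle
	int inset = ( ( maxi - mini ) - round ) & mask;						// psubw + pand
	mini = ( ( ( mini * shiftUp ) + inset ) * shiftDown ) >> 16;		// pmullw + paddw + pmulhw
	maxi = ( ( ( maxi * shiftUp ) - inset ) * shiftDown ) >> 16;		// pmullw + psubw + pmulhw
	// mini and maxi must be >= 0 and <= 255
	mini = mini < 0 ? 0 : ( mini > 255 ? 255 : mini );
	maxi = maxi < 0 ? 0 : ( maxi > 255 ? 255 : maxi );
	// quantize, then replicate the kept top bits into the low bits
	mini = ( mini & quantMask ) | ( ( mini * rep ) >> 16 );				// pand + pmulhw + por
	maxi = ( maxi & quantMask ) | ( ( maxi * rep ) >> 16 );
}
#endif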

/*
========================
idDxtEncoder::CompressNormalMapDXT5Fast_SSE2

params: inBuf - image to compress in _y_x component order
paramO: outBuf - result of compression
params: width - width of image
params: height - height of image
========================
*/
void idDxtEncoder::CompressNormalMapDXT5Fast_SSE2( const byte *inBuf, byte *outBuf, int width, int height ) {
	ALIGN16( byte block[64] );
	ALIGN16( byte normal1[4] );
	ALIGN16( byte normal2[4] );

	assert( width >= 4 && ( width & 3 ) == 0 );
	assert( height >= 4 && ( height & 3 ) == 0 );

#ifdef TEST_COMPRESSION
	const byte * srcBuf = inBuf;	// the loops below advance inBuf; keep the start for the verification pass
#endif

	this->width = width;
	this->height = height;
	this->outData = outBuf;
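	// Each 4x4 block emits 16 bytes: an 8-byte alpha block carrying Nx,
	// followed by an 8-byte DXT1-style color block carrying Ny in green.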
	for ( int j = 0; j < height; j += 4, inBuf += width * 4*4 ) {
		for ( int i = 0; i < width; i += 4 ) {
			ExtractBlock_SSE2( inBuf + i * 4, width, block );

			GetMinMaxBBox_SSE2( block, normal1, normal2 );
			InsetNormalsBBoxDXT5_SSE2( normal1, normal2 );

			// Write out Nx into alpha channel.
			EmitByte( normal2[3] );
			EmitByte( normal1[3] );
			EmitAlphaIndices_SSE2( block, 3*8, normal1[3], normal2[3] );

			// Write out Ny into green channel.
			EmitUShort( ColorTo565( block[0], normal2[1], block[2] ) );
			EmitUShort( ColorTo565( block[0], normal1[1], block[2] ) );
			EmitGreenIndices_SSE2( block, 1*8, normal1[1], normal2[1] );
		}
		outData += dstPadding;
		inBuf += srcPadding;
	}
#ifdef TEST_COMPRESSION
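	// Verify the SIMD output against the generic encoder, block by block.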
	int tmpDstPadding = dstPadding;
	dstPadding = 0;
	byte * testOutBuf = (byte *) _alloca16( width * height );
	CompressNormalMapDXT5Fast_Generic( srcBuf, testOutBuf, width, height );	// srcBuf: inBuf was advanced past the image above
	for ( int j = 0; j < height / 4; j++ ) {
		for ( int i = 0; i < width / 4; i++ ) {
			byte * ptr1 = outBuf + ( j * width/4 + i ) * 16 + j * tmpDstPadding;
			byte * ptr2 = testOutBuf + ( j * width/4 + i ) * 16;
			for ( int k = 0; k < 16; k++ ) {
				assert( ptr1[k] == ptr2[k] );
			}
		}
	}
	dstPadding = tmpDstPadding;
#endif
}
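
/*
	Usage sketch (not compiled): feeding the fast normal-map path, assuming a
	hypothetical caller that has already swizzled the normals into the _y_x
	component order the encoder expects. DXT5 output is one byte per source texel.
*/
#if 0
void CompressNormalMapExample( idDxtEncoder & encoder, const byte * swizzledNormals, byte * out, int width, int height ) {
	// out must hold width * height bytes: 16 bytes per 4x4 block
	encoder.CompressNormalMapDXT5Fast_SSE2( swizzledNormals, out, width, height );
}
#endif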
#endif