12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
727782779 |
- /*
- ===========================================================================
- Doom 3 BFG Edition GPL Source Code
- Copyright (C) 1993-2012 id Software LLC, a ZeniMax Media company.
- This file is part of the Doom 3 BFG Edition GPL Source Code ("Doom 3 BFG Edition Source Code").
- Doom 3 BFG Edition Source Code is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
- Doom 3 BFG Edition Source Code is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
- You should have received a copy of the GNU General Public License
- along with Doom 3 BFG Edition Source Code. If not, see <http://www.gnu.org/licenses/>.
- In addition, the Doom 3 BFG Edition Source Code is also subject to certain additional terms. You should have received a copy of these additional terms immediately following the terms and conditions of the GNU General Public License which accompanied the Doom 3 BFG Edition Source Code. If not, please request a copy in writing from id Software at the address below.
- If you have questions concerning this license or the applicable additional terms, you may contact in writing id Software LLC, c/o ZeniMax Media Inc., Suite 120, Rockville, Maryland 20850 USA.
- ===========================================================================
- */
- /*
- ================================================================================================
- Contains the DxtEncoder implementation for SSE2.
- ================================================================================================
- */
- #pragma hdrstop
- #include "DXTCodec_local.h"
- #include "DXTCodec.h"
- #if defined( ID_WIN_X86_SSE2_INTRIN ) || ( ( defined( ID_WIN_X86_ASM ) || defined( ID_MAC_X86_ASM ) ) )
- //#define TEST_COMPRESSION
- #ifdef TEST_COMPRESSION
- #include <malloc.h>
- #endif
- #define INSET_COLOR_SHIFT 4 // inset the bounding box with ( range >> shift )
- #define INSET_ALPHA_SHIFT 5 // inset alpha channel with ( range >> shift )
- #define C565_5_MASK 0xF8 // 0xFF minus last three bits ( keeps the top 5 bits of a byte )
- #define C565_6_MASK 0xFC // 0xFF minus last two bits ( keeps the top 6 bits of a byte )
- #define NVIDIA_7X_HARDWARE_BUG_FIX // keep the DXT5 colors sorted as: max, min
- #if !defined( R_SHUFFLE_D ) // only define the shuffle helper if nothing earlier provided it
- #define R_SHUFFLE_D( x, y, z, w ) (( (w) & 3 ) << 6 | ( (z) & 3 ) << 4 | ( (y) & 3 ) << 2 | ( (x) & 3 )) // packs four 2-bit selectors into one immediate: x in bits 0-1, y in bits 2-3, z in bits 4-5, w in bits 6-7
- #endif
- typedef uint16 word; // element type of the SIMD_SSE2_word_* tables below
- typedef uint32 dword; // element type of the SIMD_SSE2_dword_* tables below
- ALIGN16( static __m128i SIMD_SSE2_zero ) = { 0, 0, 0, 0 }; // all-zero 128-bit constant
- ALIGN16( static dword SIMD_SSE2_dword_byte_mask[4] ) = { 0x000000FF, 0x000000FF, 0x000000FF, 0x000000FF }; // low byte of every dword
- ALIGN16( static dword SIMD_SSE2_dword_word_mask[4] ) = { 0x0000FFFF, 0x0000FFFF, 0x0000FFFF, 0x0000FFFF }; // low word of every dword
- ALIGN16( static dword SIMD_SSE2_dword_red_mask[4] ) = { 0x000000FF, 0x000000FF, 0x000000FF, 0x000000FF }; // bits 0-7 of every dword
- ALIGN16( static dword SIMD_SSE2_dword_green_mask[4] ) = { 0x0000FF00, 0x0000FF00, 0x0000FF00, 0x0000FF00 }; // bits 8-15 of every dword
- ALIGN16( static dword SIMD_SSE2_dword_blue_mask[4] ) = { 0x00FF0000, 0x00FF0000, 0x00FF0000, 0x00FF0000 }; // bits 16-23 of every dword
- ALIGN16( static dword SIMD_SSE2_dword_colorMask_1010[4] ) = { 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000 }; // selects dwords 0 and 2
- ALIGN16( static dword SIMD_SSE2_dword_colorMask_0100[4] ) = { 0x00000000, 0xFFFFFFFF, 0x00000000, 0x00000000 }; // selects dword 1 only
- ALIGN16( static dword SIMD_SSE2_dword_alpha_bit_mask0[4] ) = { 7<<0, 0, 7<<0, 0 }; // alpha_bit_maskN: 3-bit field at bit 3*N, present in dwords 0 and 2
- ALIGN16( static dword SIMD_SSE2_dword_alpha_bit_mask1[4] ) = { 7<<3, 0, 7<<3, 0 };
- ALIGN16( static dword SIMD_SSE2_dword_alpha_bit_mask2[4] ) = { 7<<6, 0, 7<<6, 0 };
- ALIGN16( static dword SIMD_SSE2_dword_alpha_bit_mask3[4] ) = { 7<<9, 0, 7<<9, 0 };
- ALIGN16( static dword SIMD_SSE2_dword_alpha_bit_mask4[4] ) = { 7<<12, 0, 7<<12, 0 };
- ALIGN16( static dword SIMD_SSE2_dword_alpha_bit_mask5[4] ) = { 7<<15, 0, 7<<15, 0 };
- ALIGN16( static dword SIMD_SSE2_dword_alpha_bit_mask6[4] ) = { 7<<18, 0, 7<<18, 0 };
- ALIGN16( static dword SIMD_SSE2_dword_alpha_bit_mask7[4] ) = { 7<<21, 0, 7<<21, 0 };
- ALIGN16( static dword SIMD_SSE2_dword_color_bit_mask0[4] ) = { 3<<0, 0, 3<<0, 0 }; // color_bit_maskN: 2-bit field at bit 2*N, present in dwords 0 and 2
- ALIGN16( static dword SIMD_SSE2_dword_color_bit_mask1[4] ) = { 3<<2, 0, 3<<2, 0 };
- ALIGN16( static dword SIMD_SSE2_dword_color_bit_mask2[4] ) = { 3<<4, 0, 3<<4, 0 };
- ALIGN16( static dword SIMD_SSE2_dword_color_bit_mask3[4] ) = { 3<<6, 0, 3<<6, 0 };
- ALIGN16( static dword SIMD_SSE2_dword_color_bit_mask4[4] ) = { 3<<8, 0, 3<<8, 0 };
- ALIGN16( static dword SIMD_SSE2_dword_color_bit_mask5[4] ) = { 3<<10, 0, 3<<10, 0 };
- ALIGN16( static dword SIMD_SSE2_dword_color_bit_mask6[4] ) = { 3<<12, 0, 3<<12, 0 };
- ALIGN16( static dword SIMD_SSE2_dword_color_bit_mask7[4] ) = { 3<<14, 0, 3<<14, 0 };
- ALIGN16( static word SIMD_SSE2_word_0[8] ) = { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }; // word_N: the value N replicated in all 8 words
- ALIGN16( static word SIMD_SSE2_word_1[8] ) = { 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001 };
- ALIGN16( static word SIMD_SSE2_word_2[8] ) = { 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, 0x0002 };
- ALIGN16( static word SIMD_SSE2_word_3[8] ) = { 0x0003, 0x0003, 0x0003, 0x0003, 0x0003, 0x0003, 0x0003, 0x0003 };
- ALIGN16( static word SIMD_SSE2_word_7[8] ) = { 0x0007, 0x0007, 0x0007, 0x0007, 0x0007, 0x0007, 0x0007, 0x0007 };
- ALIGN16( static word SIMD_SSE2_word_8[8] ) = { 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008 };
- ALIGN16( static word SIMD_SSE2_word_31[8] ) = { 31, 31, 31, 31, 31, 31, 31, 31 };
- ALIGN16( static word SIMD_SSE2_word_63[8] ) = { 63, 63, 63, 63, 63, 63, 63, 63 };
- ALIGN16( static word SIMD_SSE2_word_127[8] ) = { 127, 127, 127, 127, 127, 127, 127, 127 };
- ALIGN16( static word SIMD_SSE2_word_255[8] ) = { 255, 255, 255, 255, 255, 255, 255, 255 };
- ALIGN16( static word SIMD_SSE2_word_center_128[8] ) = { 128, 128, 0, 0, 0, 0, 0, 0 }; // 128 in words 0 and 1 only
- ALIGN16( static word SIMD_SSE2_word_div_by_3[8] ) = { (1<<16)/3+1, (1<<16)/3+1, (1<<16)/3+1, (1<<16)/3+1, (1<<16)/3+1, (1<<16)/3+1, (1<<16)/3+1, (1<<16)/3+1 }; // div_by_N: 16.16 reciprocal of N; a high-word multiply ( pmulhw ) by it yields ( x * ( ( 1 << 16 ) / N + 1 ) ) >> 16
- ALIGN16( static word SIMD_SSE2_word_div_by_6[8] ) = { (1<<16)/6+1, (1<<16)/6+1, (1<<16)/6+1, (1<<16)/6+1, (1<<16)/6+1, (1<<16)/6+1, (1<<16)/6+1, (1<<16)/6+1 };
- ALIGN16( static word SIMD_SSE2_word_div_by_14[8] ) = { (1<<16)/14+1, (1<<16)/14+1, (1<<16)/14+1, (1<<16)/14+1, (1<<16)/14+1, (1<<16)/14+1, (1<<16)/14+1, (1<<16)/14+1 };
- ALIGN16( static word SIMD_SSE2_word_scale_7_9_11_13[8] ) = { 7, 7, 9, 9, 11, 11, 13, 13 };
- ALIGN16( static word SIMD_SSE2_word_scale_7_5_3_1[8] ) = { 7, 7, 5, 5, 3, 3, 1, 1 };
- ALIGN16( static word SIMD_SSE2_word_scale_5_3_1[8] ) = { 5, 3, 1, 0, 5, 3, 1, 0 };
- ALIGN16( static word SIMD_SSE2_word_scale_1_3_5[8] ) = { 1, 3, 5, 0, 1, 3, 5, 0 };
- ALIGN16( static word SIMD_SSE2_word_insetShift[8] ) = { 1 << ( 16 - INSET_COLOR_SHIFT ), 1 << ( 16 - INSET_COLOR_SHIFT ), 1 << ( 16 - INSET_COLOR_SHIFT ), 1 << ( 16 - INSET_ALPHA_SHIFT ), 0, 0, 0, 0 }; // a high-word multiply by these gives range >> INSET_COLOR_SHIFT in words 0-2 and range >> INSET_ALPHA_SHIFT in word 3
- ALIGN16( static word SIMD_SSE2_word_insetYCoCgRound[8] ) = { ((1<<(INSET_COLOR_SHIFT-1))-1), ((1<<(INSET_COLOR_SHIFT-1))-1), ((1<<(INSET_COLOR_SHIFT-1))-1), ((1<<(INSET_ALPHA_SHIFT-1))-1), 0, 0, 0, 0 };
- ALIGN16( static word SIMD_SSE2_word_insetYCoCgMask[8] ) = { 0xFFFF, 0xFFFF, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0xFFFF }; // clears words 2 and 6
- ALIGN16( static word SIMD_SSE2_word_insetYCoCgShiftUp[8] ) = { 1 << INSET_COLOR_SHIFT, 1 << INSET_COLOR_SHIFT, 1 << INSET_COLOR_SHIFT, 1 << INSET_ALPHA_SHIFT, 0, 0, 0, 0 };
- ALIGN16( static word SIMD_SSE2_word_insetYCoCgShiftDown[8] ) = { 1 << ( 16 - INSET_COLOR_SHIFT ), 1 << ( 16 - INSET_COLOR_SHIFT ), 1 << ( 16 - INSET_COLOR_SHIFT ), 1 << ( 16 - INSET_ALPHA_SHIFT ), 0, 0, 0, 0 }; // same values as SIMD_SSE2_word_insetShift
- ALIGN16( static word SIMD_SSE2_word_insetYCoCgQuantMask[8] ) = { C565_5_MASK, C565_6_MASK, C565_5_MASK, 0xFF, C565_5_MASK, C565_6_MASK, C565_5_MASK, 0xFF };
- ALIGN16( static word SIMD_SSE2_word_insetYCoCgRep[8] ) = { 1 << ( 16 - 5 ), 1 << ( 16 - 6 ), 1 << ( 16 - 5 ), 0, 1 << ( 16 - 5 ), 1 << ( 16 - 6 ), 1 << ( 16 - 5 ), 0 };
- ALIGN16( static word SIMD_SSE2_word_insetNormalDXT5Round[8] ) = { 0, ((1<<(INSET_COLOR_SHIFT-1))-1), 0, ((1<<(INSET_ALPHA_SHIFT-1))-1), 0, 0, 0, 0 };
- ALIGN16( static word SIMD_SSE2_word_insetNormalDXT5Mask[8] ) = { 0x0000, 0xFFFF, 0x0000, 0xFFFF, 0x0000, 0x0000, 0x0000, 0x0000 }; // keeps words 1 and 3 only
- ALIGN16( static word SIMD_SSE2_word_insetNormalDXT5ShiftUp[8] ) = { 1, 1 << INSET_COLOR_SHIFT, 1, 1 << INSET_ALPHA_SHIFT, 1, 1, 1, 1 };
- ALIGN16( static word SIMD_SSE2_word_insetNormalDXT5ShiftDown[8] ) = { 0, 1 << ( 16 - INSET_COLOR_SHIFT ), 0, 1 << ( 16 - INSET_ALPHA_SHIFT ), 0, 0, 0, 0 };
- ALIGN16( static word SIMD_SSE2_word_insetNormalDXT5QuantMask[8] ) = { 0xFF, C565_6_MASK, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF };
- ALIGN16( static word SIMD_SSE2_word_insetNormalDXT5Rep[8] ) = { 0, 1 << ( 16 - 6 ), 0, 0, 0, 0, 0, 0 };
- ALIGN16( static word SIMD_SSE2_word_insetNormal3DcRound[8] ) = { ((1<<(INSET_ALPHA_SHIFT-1))-1), ((1<<(INSET_ALPHA_SHIFT-1))-1), 0, 0, 0, 0, 0, 0 };
- ALIGN16( static word SIMD_SSE2_word_insetNormal3DcMask[8] ) = { 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }; // keeps words 0 and 1 only
- ALIGN16( static word SIMD_SSE2_word_insetNormal3DcShiftUp[8] ) = { 1 << INSET_ALPHA_SHIFT, 1 << INSET_ALPHA_SHIFT, 1, 1, 1, 1, 1, 1 };
- ALIGN16( static word SIMD_SSE2_word_insetNormal3DcShiftDown[8] ) = { 1 << ( 16 - INSET_ALPHA_SHIFT ), 1 << ( 16 - INSET_ALPHA_SHIFT ), 0, 0, 0, 0, 0, 0 };
- ALIGN16( static byte SIMD_SSE2_byte_0[16] ) = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }; // byte_N: the value N replicated in all 16 bytes
- ALIGN16( static byte SIMD_SSE2_byte_1[16] ) = { 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01 };
- ALIGN16( static byte SIMD_SSE2_byte_2[16] ) = { 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02 };
- ALIGN16( static byte SIMD_SSE2_byte_3[16] ) = { 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03 };
- ALIGN16( static byte SIMD_SSE2_byte_4[16] ) = { 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04 };
- ALIGN16( static byte SIMD_SSE2_byte_7[16] ) = { 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07 };
- ALIGN16( static byte SIMD_SSE2_byte_8[16] ) = { 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08 };
- ALIGN16( static byte SIMD_SSE2_byte_not[16] ) = { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF }; // all bits set
- ALIGN16( static byte SIMD_SSE2_byte_colorMask[16] ) = { C565_5_MASK, C565_6_MASK, C565_5_MASK, 0x00, 0x00, 0x00, 0x00, 0x00, C565_5_MASK, C565_6_MASK, C565_5_MASK, 0x00, 0x00, 0x00, 0x00, 0x00 }; // C565 5/6/5 masks on bytes 0-2 and 8-10, zero elsewhere
- ALIGN16( static byte SIMD_SSE2_byte_colorMask2[16] ) = { 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00 }; // keeps bytes 0-2 and 8-10
- ALIGN16( static byte SIMD_SSE2_byte_ctx1Mask[16] ) = { 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }; // keeps bytes 0-1 and 8-9
- ALIGN16( static byte SIMD_SSE2_byte_diagonalMask[16] ) = { 0x00, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }; // keeps byte 1 only
- ALIGN16( static byte SIMD_SSE2_byte_scale_mask0[16] ) = { 0x00, 0x00, 0xFF, 0xFF, 0x00, 0x00, 0xFF, 0xFF, 0x00, 0x00, 0xFF, 0xFF, 0x00, 0x00, 0xFF, 0xFF };
- ALIGN16( static byte SIMD_SSE2_byte_scale_mask1[16] ) = { 0xFF, 0x00, 0x00, 0x00, 0xFF, 0x00, 0x00, 0x00, 0xFF, 0x00, 0x00, 0x00, 0xFF, 0x00, 0x00, 0x00 };
- ALIGN16( static byte SIMD_SSE2_byte_scale_mask2[16] ) = { 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00 };
- ALIGN16( static byte SIMD_SSE2_byte_scale_mask3[16] ) = { 0xFF, 0xFF, 0x00, 0xFF, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0x00, 0xFF, 0x00, 0x00, 0x00, 0x00 };
- ALIGN16( static byte SIMD_SSE2_byte_scale_mask4[16] ) = { 0x00, 0x00, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00 };
- ALIGN16( static byte SIMD_SSE2_byte_minus_128_0[16] ) = { (byte)-128, (byte)-128, 0, 0, (byte)-128, (byte)-128, 0, 0, (byte)-128, (byte)-128, 0, 0, (byte)-128, (byte)-128, 0, 0 }; // (byte)-128 in bytes 0-1 of every dword, zero in bytes 2-3
- /*
- ========================
- idDxtEncoder::ExtractBlock_SSE2
- Copies one 4x4 pixel tile out of the image: 16 bytes are read at inPtr and at the
- start of each of the following three rows ( row stride = width * 4 bytes ) and
- stored back to back into the 64 bytes at colorBlock.
- Every access is an aligned 128-bit load or store ( movdqa / _mm_load_si128 ), so
- inPtr, the row stride and colorBlock all have to be multiples of 16 bytes.
- params: inPtr - input image, 4 bytes per pixel
- params: width - row length in pixels, used to compute the row stride
- paramO: colorBlock - 4*4 output tile, 4 bytes per pixel
- ========================
- */
- ID_INLINE void idDxtEncoder::ExtractBlock_SSE2( const byte * inPtr, int width, byte * colorBlock ) const {
- #if ( defined( ID_WIN_X86_ASM ) || defined( ID_MAC_X86_ASM ) )
- __asm {
- mov esi, inPtr
- mov edi, colorBlock
- mov eax, width
- shl eax, 2 // eax = width * 4 = row stride in bytes
- movdqa xmm0, xmmword ptr [esi] // row 0
- movdqa xmmword ptr [edi+ 0], xmm0
- movdqa xmm1, xmmword ptr [esi+eax] // row 1: + 4 * width
- movdqa xmmword ptr [edi+16], xmm1
- movdqa xmm2, xmmword ptr [esi+eax*2] // row 2: + 8 * width
- add esi, eax // step one row so that [esi+eax*2] below addresses row 3
- movdqa xmmword ptr [edi+32], xmm2
- movdqa xmm3, xmmword ptr [esi+eax*2] // row 3: + 12 * width from the original inPtr
- movdqa xmmword ptr [edi+48], xmm3
- }
- #elif defined ( ID_WIN_X86_SSE2_INTRIN )
- *((__m128i *)(&colorBlock[ 0])) = _mm_load_si128( (__m128i *)( inPtr + width * 4 * 0 ) ); // row 0
- *((__m128i *)(&colorBlock[16])) = _mm_load_si128( (__m128i *)( inPtr + width * 4 * 1 ) ); // row 1
- *((__m128i *)(&colorBlock[32])) = _mm_load_si128( (__m128i *)( inPtr + width * 4 * 2 ) ); // row 2
- *((__m128i *)(&colorBlock[48])) = _mm_load_si128( (__m128i *)( inPtr + width * 4 * 3 ) ); // row 3
- #else
- assert( false ); // neither the inline-asm nor the intrinsics path is compiled in
- #endif
- }
- /*
- ========================
- idDxtEncoder::GetMinMaxBBox_SSE2
- Takes the extents of the bounding box of the colors in the 4x4 block.
- The per-byte unsigned min and max are first taken across the four 16-byte rows,
- then folded horizontally ( 4 pixels -> 2 -> 1 ) so that the low dword holds the
- min / max of every channel over all 16 pixels. Only that dword is written out.
- colorBlock is read with aligned 128-bit loads, so it has to be 16-byte aligned.
- params: colorBlock - 4*4 input tile, 4 bytes per pixel
- paramO: minColor - Min 4 byte output color
- paramO: maxColor - Max 4 byte output color
- ========================
- */
- ID_INLINE void idDxtEncoder::GetMinMaxBBox_SSE2( const byte * colorBlock, byte * minColor, byte * maxColor ) const {
- #if ( defined( ID_WIN_X86_ASM ) || defined( ID_MAC_X86_ASM ) )
- __asm {
- mov eax, colorBlock
- mov esi, minColor
- mov edi, maxColor
- movdqa xmm0, xmmword ptr [eax+ 0] // xmm0 accumulates the running min
- movdqa xmm1, xmmword ptr [eax+ 0] // xmm1 accumulates the running max
- pminub xmm0, xmmword ptr [eax+16]
- pmaxub xmm1, xmmword ptr [eax+16]
- pminub xmm0, xmmword ptr [eax+32]
- pmaxub xmm1, xmmword ptr [eax+32]
- pminub xmm0, xmmword ptr [eax+48]
- pmaxub xmm1, xmmword ptr [eax+48]
- pshufd xmm3, xmm0, R_SHUFFLE_D( 2, 3, 2, 3 ) // bring dwords 2,3 down to positions 0,1
- pshufd xmm4, xmm1, R_SHUFFLE_D( 2, 3, 2, 3 )
- pminub xmm0, xmm3 // dwords 0,1 now cover all four columns
- pmaxub xmm1, xmm4
- pshuflw xmm6, xmm0, R_SHUFFLE_D( 2, 3, 2, 3 ) // bring words 2,3 ( dword 1 ) down to dword 0
- pshuflw xmm7, xmm1, R_SHUFFLE_D( 2, 3, 2, 3 )
- pminub xmm0, xmm6 // dword 0 = per-channel min of the whole block
- pmaxub xmm1, xmm7 // dword 0 = per-channel max of the whole block
- movd dword ptr [esi], xmm0
- movd dword ptr [edi], xmm1
- }
- #elif defined ( ID_WIN_X86_SSE2_INTRIN )
- __m128i block0 = *((__m128i *)(&colorBlock[ 0]));
- __m128i block1 = *((__m128i *)(&colorBlock[16]));
- __m128i block2 = *((__m128i *)(&colorBlock[32]));
- __m128i block3 = *((__m128i *)(&colorBlock[48]));
- __m128i max1 = _mm_max_epu8( block0, block1 ); // rows 0 and 1
- __m128i min1 = _mm_min_epu8( block0, block1 );
- __m128i max2 = _mm_max_epu8( block2, block3 ); // rows 2 and 3
- __m128i min2 = _mm_min_epu8( block2, block3 );
- __m128i max3 = _mm_max_epu8( max1, max2 ); // all four rows
- __m128i min3 = _mm_min_epu8( min1, min2 );
- __m128i max4 = _mm_shuffle_epi32( max3, R_SHUFFLE_D( 2, 3, 2, 3 ) ); // bring dwords 2,3 down to positions 0,1
- __m128i min4 = _mm_shuffle_epi32( min3, R_SHUFFLE_D( 2, 3, 2, 3 ) );
- __m128i max5 = _mm_max_epu8( max3, max4 ); // dwords 0,1 now cover all four columns
- __m128i min5 = _mm_min_epu8( min3, min4 );
- __m128i max6 = _mm_shufflelo_epi16( max5, R_SHUFFLE_D( 2, 3, 2, 3 ) ); // bring words 2,3 ( dword 1 ) down to dword 0
- __m128i min6 = _mm_shufflelo_epi16( min5, R_SHUFFLE_D( 2, 3, 2, 3 ) );
- max6 = _mm_max_epu8( max5, max6 ); // dword 0 = per-channel max of the whole block
- min6 = _mm_min_epu8( min5, min6 ); // dword 0 = per-channel min of the whole block
- *((int *)maxColor) = _mm_cvtsi128_si32( max6 ); // only the low 4 bytes are stored
- *((int *)minColor) = _mm_cvtsi128_si32( min6 );
- #else
- assert( false ); // neither the inline-asm nor the intrinsics path is compiled in
- #endif
- }
- /*
- ========================
- idDxtEncoder::InsetColorsBBox_SSE2
- Shrinks the color bounding box in place: for every one of the 4 bytes,
- inset = ( maxColor - minColor ) >> shift is added to minColor and subtracted from
- maxColor. The shift is INSET_COLOR_SHIFT for bytes 0-2 and INSET_ALPHA_SHIFT for
- byte 3 ( see SIMD_SSE2_word_insetShift ). The math is done on 16-bit words and the
- results are packed back to bytes with unsigned saturation.
- params: minColor - Min 4 byte color, read and overwritten
- params: maxColor - Max 4 byte color, read and overwritten
- ========================
- */
- ID_INLINE void idDxtEncoder::InsetColorsBBox_SSE2( byte * minColor, byte * maxColor ) const {
- #if ( defined( ID_WIN_X86_ASM ) || defined( ID_MAC_X86_ASM ) )
- __asm {
- mov esi, minColor
- mov edi, maxColor
- movd xmm0, dword ptr [esi]
- movd xmm1, dword ptr [edi]
- punpcklbw xmm0, SIMD_SSE2_byte_0 // widen the min bytes to words
- punpcklbw xmm1, SIMD_SSE2_byte_0 // widen the max bytes to words
- movdqa xmm2, xmm1
- psubw xmm2, xmm0 // range = max - min
- pmulhw xmm2, SIMD_SSE2_word_insetShift // inset = ( range * ( 1 << ( 16 - shift ) ) ) >> 16 = range >> shift
- paddw xmm0, xmm2 // min += inset
- psubw xmm1, xmm2 // max -= inset
- packuswb xmm0, xmm0 // back to bytes with unsigned saturation
- packuswb xmm1, xmm1
- movd dword ptr [esi], xmm0
- movd dword ptr [edi], xmm1
- }
- #elif defined ( ID_WIN_X86_SSE2_INTRIN )
- __m128i min = _mm_cvtsi32_si128( *(int *)minColor );
- __m128i max = _mm_cvtsi32_si128( *(int *)maxColor );
- __m128i xmm0 = _mm_unpacklo_epi8( min, *(__m128i *)SIMD_SSE2_byte_0 ); // widen the min bytes to words
- __m128i xmm1 = _mm_unpacklo_epi8( max, *(__m128i *)SIMD_SSE2_byte_0 ); // widen the max bytes to words
- __m128i xmm2 = _mm_sub_epi16( xmm1, xmm0 ); // range = max - min
- xmm2 = _mm_mulhi_epi16( xmm2, *(__m128i *)SIMD_SSE2_word_insetShift ); // inset = range >> shift
- xmm0 = _mm_add_epi16( xmm0, xmm2 ); // min += inset
- xmm1 = _mm_sub_epi16( xmm1, xmm2 ); // max -= inset
- xmm0 = _mm_packus_epi16( xmm0, xmm0 ); // back to bytes with unsigned saturation
- xmm1 = _mm_packus_epi16( xmm1, xmm1 );
- *((int *)minColor) = _mm_cvtsi128_si32( xmm0 );
- *((int *)maxColor) = _mm_cvtsi128_si32( xmm1 );
- #else
- assert( false ); // neither the inline-asm nor the intrinsics path is compiled in
- #endif
- }
- /*
- ========================
- idDxtEncoder::EmitColorIndices_SSE2
- params: colorBlock - 16 pixel block for which to find color indices
- params: minColor_ - Min color of the block ( 4 bytes, read only )
- params: maxColor_ - Max color of the block ( 4 bytes, read only )
- output: the 4 byte color index block is written to outData, which is then advanced by 4 bytes
- ========================
- */
- void idDxtEncoder::EmitColorIndices_SSE2( const byte * colorBlock, const byte * minColor_, const byte * maxColor_ ) {
- #if ( defined( ID_WIN_X86_ASM ) || defined( ID_MAC_X86_ASM ) )
- ALIGN16( byte color0[16] );
- ALIGN16( byte color1[16] );
- ALIGN16( byte color2[16] );
- ALIGN16( byte color3[16] );
- ALIGN16( byte result[16] );
- byte *outPtr = outData;
- __asm {
- mov esi, maxColor_
- mov edi, minColor_
- pxor xmm7, xmm7
- movdqa result, xmm7
- movd xmm0, dword ptr [esi]
- pand xmm0, SIMD_SSE2_byte_colorMask
- punpcklbw xmm0, xmm7
- pshuflw xmm4, xmm0, R_SHUFFLE_D( 0, 3, 2, 3 )
- pshuflw xmm5, xmm0, R_SHUFFLE_D( 3, 1, 3, 3 )
- psrlw xmm4, 5
- psrlw xmm5, 6
- por xmm0, xmm4
- por xmm0, xmm5
- movd xmm1, dword ptr [edi]
- pand xmm1, SIMD_SSE2_byte_colorMask
- punpcklbw xmm1, xmm7
- pshuflw xmm4, xmm1, R_SHUFFLE_D( 0, 3, 2, 3 )
- pshuflw xmm5, xmm1, R_SHUFFLE_D( 3, 1, 3, 3 )
- psrlw xmm4, 5
- psrlw xmm5, 6
- por xmm1, xmm4
- por xmm1, xmm5
- movdqa xmm2, xmm0
- packuswb xmm2, xmm7
- pshufd xmm2, xmm2, R_SHUFFLE_D( 0, 1, 0, 1 )
- movdqa color0, xmm2
- movdqa xmm6, xmm0
- paddw xmm6, xmm0
- paddw xmm6, xmm1
- pmulhw xmm6, SIMD_SSE2_word_div_by_3 // * ( ( 1 << 16 ) / 3 + 1 ) ) >> 16
- packuswb xmm6, xmm7
- pshufd xmm6, xmm6, R_SHUFFLE_D( 0, 1, 0, 1 )
- movdqa color2, xmm6
- movdqa xmm3, xmm1
- packuswb xmm3, xmm7
- pshufd xmm3, xmm3, R_SHUFFLE_D( 0, 1, 0, 1 )
- movdqa color1, xmm3
- paddw xmm1, xmm1
- paddw xmm0, xmm1
- pmulhw xmm0, SIMD_SSE2_word_div_by_3 // * ( ( 1 << 16 ) / 3 + 1 ) ) >> 16
- packuswb xmm0, xmm7
- pshufd xmm0, xmm0, R_SHUFFLE_D( 0, 1, 0, 1 )
- movdqa color3, xmm0
- mov eax, 32
- mov esi, colorBlock
- loop1: // iterates 2 times
- movq xmm3, qword ptr [esi+eax+0]
- pshufd xmm3, xmm3, R_SHUFFLE_D( 0, 2, 1, 3 ) // punpckldq xmm4, SIMD_SSE2_dword_0
- movq xmm5, qword ptr [esi+eax+8]
- pshufd xmm5, xmm5, R_SHUFFLE_D( 0, 2, 1, 3 ) // punpckldq xmm5, SIMD_SSE2_dword_0
- movdqa xmm0, xmm3
- movdqa xmm6, xmm5
- psadbw xmm0, color0
- psadbw xmm6, color0
- packssdw xmm0, xmm6
- movdqa xmm1, xmm3
- movdqa xmm6, xmm5
- psadbw xmm1, color1
- psadbw xmm6, color1
- packssdw xmm1, xmm6
- movdqa xmm2, xmm3
- movdqa xmm6, xmm5
- psadbw xmm2, color2
- psadbw xmm6, color2
- packssdw xmm2, xmm6
- psadbw xmm3, color3
- psadbw xmm5, color3
- packssdw xmm3, xmm5
- movq xmm4, qword ptr [esi+eax+16]
- pshufd xmm4, xmm4, R_SHUFFLE_D( 0, 2, 1, 3 )
- movq xmm5, qword ptr [esi+eax+24]
- pshufd xmm5, xmm5, R_SHUFFLE_D( 0, 2, 1, 3 )
- movdqa xmm6, xmm4
- movdqa xmm7, xmm5
- psadbw xmm6, color0
- psadbw xmm7, color0
- packssdw xmm6, xmm7
- packssdw xmm0, xmm6 // d1
- movdqa xmm6, xmm4
- movdqa xmm7, xmm5
- psadbw xmm6, color1
- psadbw xmm7, color1
- packssdw xmm6, xmm7
- packssdw xmm1, xmm6 // d1
- movdqa xmm6, xmm4
- movdqa xmm7, xmm5
- psadbw xmm6, color2
- psadbw xmm7, color2
- packssdw xmm6, xmm7
- packssdw xmm2, xmm6 // d2
- psadbw xmm4, color3
- psadbw xmm5, color3
- packssdw xmm4, xmm5
- packssdw xmm3, xmm4 // d3
- movdqa xmm7, result
- pslld xmm7, 16
- movdqa xmm4, xmm0
- movdqa xmm5, xmm1
- pcmpgtw xmm0, xmm3 // b0
- pcmpgtw xmm1, xmm2 // b1
- pcmpgtw xmm4, xmm2 // b2
- pcmpgtw xmm5, xmm3 // b3
- pcmpgtw xmm2, xmm3 // b4
- pand xmm4, xmm1 // x0
- pand xmm5, xmm0 // x1
- pand xmm2, xmm0 // x2
- por xmm4, xmm5
- pand xmm2, SIMD_SSE2_word_1
- pand xmm4, SIMD_SSE2_word_2
- por xmm2, xmm4
- pshufd xmm5, xmm2, R_SHUFFLE_D( 2, 3, 0, 1 )
- punpcklwd xmm2, SIMD_SSE2_word_0
- punpcklwd xmm5, SIMD_SSE2_word_0
- pslld xmm5, 8
- por xmm7, xmm5
- por xmm7, xmm2
- movdqa result, xmm7
- sub eax, 32
- jge loop1
- mov esi, outPtr
- pshufd xmm4, xmm7, R_SHUFFLE_D( 1, 2, 3, 0 )
- pshufd xmm5, xmm7, R_SHUFFLE_D( 2, 3, 0, 1 )
- pshufd xmm6, xmm7, R_SHUFFLE_D( 3, 0, 1, 2 )
- pslld xmm4, 2
- pslld xmm5, 4
- pslld xmm6, 6
- por xmm7, xmm4
- por xmm7, xmm5
- por xmm7, xmm6
- movd dword ptr [esi], xmm7
- }
- outData += 4;
- #elif defined ( ID_WIN_X86_SSE2_INTRIN )
- __m128c zero = SIMD_SSE2_zero;
- __m128c result = SIMD_SSE2_zero;
- __m128c color0, color1, color2, color3;
- __m128c temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
- __m128c minColor = _mm_cvtsi32_si128( *(int *)minColor_ );
- __m128c maxColor = _mm_cvtsi32_si128( *(int *)maxColor_ );
- __m128c blocka[2], blockb[2];
- blocka[0] = *((__m128i *)(&colorBlock[ 0]));
- blocka[1] = *((__m128i *)(&colorBlock[32]));
- blockb[0] = *((__m128i *)(&colorBlock[16]));
- blockb[1] = *((__m128i *)(&colorBlock[48]));
- temp0 = _mm_and_si128( maxColor, (const __m128i &)SIMD_SSE2_byte_colorMask );
- temp0 = _mm_unpacklo_epi8( temp0, zero );
- temp4 = _mm_shufflelo_epi16( temp0, R_SHUFFLE_D( 0, 3, 2, 3 ) );
- temp5 = _mm_shufflelo_epi16( temp0, R_SHUFFLE_D( 3, 1, 3, 3 ) );
- temp4 = _mm_srli_epi16( temp4, 5 );
- temp5 = _mm_srli_epi16( temp5, 6 );
- temp0 = _mm_or_si128( temp0, temp4 );
- temp0 = _mm_or_si128( temp0, temp5 );
- temp1 = _mm_and_si128( minColor, (const __m128i &)SIMD_SSE2_byte_colorMask );
- temp1 = _mm_unpacklo_epi8( temp1, zero );
- temp4 = _mm_shufflelo_epi16( temp1, R_SHUFFLE_D( 0, 3, 2, 3 ) );
- temp5 = _mm_shufflelo_epi16( temp1, R_SHUFFLE_D( 3, 1, 3, 3 ) );
- temp4 = _mm_srli_epi16( temp4, 5 );
- temp5 = _mm_srli_epi16( temp5, 6 );
- temp1 = _mm_or_si128( temp1, temp4 );
- temp1 = _mm_or_si128( temp1, temp5 );
- temp2 = _mm_packus_epi16( temp0, zero );
- color0 = _mm_shuffle_epi32( temp2, R_SHUFFLE_D( 0, 1, 0, 1 ) );
- temp6 = _mm_add_epi16( temp0, temp0 );
- temp6 = _mm_add_epi16( temp6, temp1 );
- temp6 = _mm_mulhi_epi16( temp6, (const __m128i &)SIMD_SSE2_word_div_by_3 ); // * ( ( 1 << 16 ) / 3 + 1 ) ) >> 16
- temp6 = _mm_packus_epi16( temp6, zero );
- color2 = _mm_shuffle_epi32( temp6, R_SHUFFLE_D( 0, 1, 0, 1 ) );
- temp3 = _mm_packus_epi16( temp1, zero );
- color1 = _mm_shuffle_epi32( temp3, R_SHUFFLE_D( 0, 1, 0, 1 ) );
- temp1 = _mm_add_epi16( temp1, temp1 );
- temp0 = _mm_add_epi16( temp0, temp1 );
- temp0 = _mm_mulhi_epi16( temp0, (const __m128i &)SIMD_SSE2_word_div_by_3 ); // * ( ( 1 << 16 ) / 3 + 1 ) ) >> 16
- temp0 = _mm_packus_epi16( temp0, zero );
- color3 = _mm_shuffle_epi32( temp0, R_SHUFFLE_D( 0, 1, 0, 1 ) );
- for ( int i = 1; i >= 0; i-- ) {
- // Load block
- temp3 = _mm_shuffle_epi32( blocka[i], R_SHUFFLE_D( 0, 2, 1, 3 ) );
- temp5 = _mm_shuffle_ps( blocka[i], zero, R_SHUFFLE_D( 2, 3, 0, 1 ) );
- temp5 = _mm_shuffle_epi32( temp5, R_SHUFFLE_D( 0, 2, 1, 3 ) );
- temp0 = _mm_sad_epu8( temp3, color0 );
- temp6 = _mm_sad_epu8( temp5, color0 );
- temp0 = _mm_packs_epi32( temp0, temp6 );
- temp1 = _mm_sad_epu8( temp3, color1 );
- temp6 = _mm_sad_epu8( temp5, color1 );
- temp1 = _mm_packs_epi32( temp1, temp6 );
- temp2 = _mm_sad_epu8( temp3, color2 );
- temp6 = _mm_sad_epu8( temp5, color2 );
- temp2 = _mm_packs_epi32( temp2, temp6 );
- temp3 = _mm_sad_epu8( temp3, color3 );
- temp5 = _mm_sad_epu8( temp5, color3 );
- temp3 = _mm_packs_epi32( temp3, temp5 );
- // Load block
- temp4 = _mm_shuffle_epi32( blockb[i], R_SHUFFLE_D( 0, 2, 1, 3 ) );
- temp5 = _mm_shuffle_ps( blockb[i], zero, R_SHUFFLE_D( 2, 3, 0, 1 ) );
- temp5 = _mm_shuffle_epi32( temp5, R_SHUFFLE_D( 0, 2, 1, 3 ) );
- temp6 = _mm_sad_epu8( temp4, color0 );
- temp7 = _mm_sad_epu8( temp5, color0 );
- temp6 = _mm_packs_epi32( temp6, temp7 );
- temp0 = _mm_packs_epi32( temp0, temp6 ); // d0
- temp6 = _mm_sad_epu8( temp4, color1 );
- temp7 = _mm_sad_epu8( temp5, color1 );
- temp6 = _mm_packs_epi32( temp6, temp7 );
- temp1 = _mm_packs_epi32( temp1, temp6 ); // d1
- temp6 = _mm_sad_epu8( temp4, color2 );
- temp7 = _mm_sad_epu8( temp5, color2 );
- temp6 = _mm_packs_epi32( temp6, temp7 );
- temp2 = _mm_packs_epi32( temp2, temp6 ); // d2
- temp4 = _mm_sad_epu8( temp4, color3 );
- temp5 = _mm_sad_epu8( temp5, color3 );
- temp4 = _mm_packs_epi32( temp4, temp5 );
- temp3 = _mm_packs_epi32( temp3, temp4 ); // d3
- temp7 = _mm_slli_epi32( result, 16 );
- temp4 = _mm_cmpgt_epi16( temp0, temp2 ); // b2
- temp5 = _mm_cmpgt_epi16( temp1, temp3 ); // b3
- temp0 = _mm_cmpgt_epi16( temp0, temp3 ); // b0
- temp1 = _mm_cmpgt_epi16( temp1, temp2 ); // b1
- temp2 = _mm_cmpgt_epi16( temp2, temp3 ); // b4
- temp4 = _mm_and_si128( temp4, temp1 ); // x0
- temp5 = _mm_and_si128( temp5, temp0 ); // x1
- temp2 = _mm_and_si128( temp2, temp0 ); // x2
- temp4 = _mm_or_si128( temp4, temp5 );
- temp2 = _mm_and_si128( temp2, (const __m128i &)SIMD_SSE2_word_1 );
- temp4 = _mm_and_si128( temp4, (const __m128i &)SIMD_SSE2_word_2 );
- temp2 = _mm_or_si128( temp2, temp4 );
- temp5 = _mm_shuffle_epi32( temp2, R_SHUFFLE_D( 2, 3, 0, 1 ) );
- temp2 = _mm_unpacklo_epi16( temp2, (const __m128i &)SIMD_SSE2_word_0 );
- temp5 = _mm_unpacklo_epi16( temp5, (const __m128i &)SIMD_SSE2_word_0 );
- temp5 = _mm_slli_epi32( temp5, 8 );
- temp7 = _mm_or_si128( temp7, temp5 );
- result = _mm_or_si128( temp7, temp2 );
- }
- temp4 = _mm_shuffle_epi32( result, R_SHUFFLE_D( 1, 2, 3, 0 ) );
- temp5 = _mm_shuffle_epi32( result, R_SHUFFLE_D( 2, 3, 0, 1 ) );
- temp6 = _mm_shuffle_epi32( result, R_SHUFFLE_D( 3, 0, 1, 2 ) );
- temp4 = _mm_slli_epi32( temp4, 2 );
- temp5 = _mm_slli_epi32( temp5, 4 );
- temp6 = _mm_slli_epi32( temp6, 6 );
- temp7 = _mm_or_si128( result, temp4 );
- temp7 = _mm_or_si128( temp7, temp5 );
- temp7 = _mm_or_si128( temp7, temp6 );
- unsigned int out = _mm_cvtsi128_si32( temp7 );
- EmitUInt( out );
- #else
- assert( false );
- #endif
- }
- /*
- ========================
- idDxtEncoder::EmitColorAlphaIndices_SSE2
- Emits the 2-bit color indices of a 16 pixel block for a three color palette:
- max, min and ( max + min ) / 2. Index 3 is given to every pixel whose top byte
- ( dword >> 24 ) does not exceed SIMD_SSE2_word_127.
- param: colorBlock - 16 pixel block (64 bytes are read) for which to find color indices
- param: minColor_ - Min color found (4 bytes are read)
- param: maxColor_ - Max color found (4 bytes are read)
- output: 4 byte color index block (stored at outData in the asm path, passed to EmitUInt otherwise)
- ========================
- */
- void idDxtEncoder::EmitColorAlphaIndices_SSE2( const byte *colorBlock, const byte *minColor_, const byte *maxColor_ ) {
- #if ( defined( ID_WIN_X86_ASM ) || defined( ID_MAC_X86_ASM ) )
- ALIGN16( byte color0[16] );
- ALIGN16( byte color1[16] );
- ALIGN16( byte color2[16] );
- ALIGN16( byte color3[16] );
- ALIGN16( byte result[16] );
- byte *outPtr = outData;
- __asm {
- mov esi, maxColor_
- mov edi, minColor_
- pxor xmm7, xmm7
- movdqa result, xmm7
- // color0 = masked max color with the high bits of each channel copied into its low bits
- movd xmm0, dword ptr [esi]
- pand xmm0, SIMD_SSE2_byte_colorMask
- punpcklbw xmm0, xmm7
- pshuflw xmm4, xmm0, R_SHUFFLE_D( 0, 3, 2, 3 )
- pshuflw xmm5, xmm0, R_SHUFFLE_D( 3, 1, 3, 3 )
- psrlw xmm4, 5
- psrlw xmm5, 6
- por xmm0, xmm4
- por xmm0, xmm5
- // color1 = same expansion applied to the min color
- movd xmm1, dword ptr [edi]
- pand xmm1, SIMD_SSE2_byte_colorMask
- punpcklbw xmm1, xmm7
- pshuflw xmm4, xmm1, R_SHUFFLE_D( 0, 3, 2, 3 )
- pshuflw xmm5, xmm1, R_SHUFFLE_D( 3, 1, 3, 3 )
- psrlw xmm4, 5
- psrlw xmm5, 6
- por xmm1, xmm4
- por xmm1, xmm5
- movdqa xmm2, xmm0
- packuswb xmm2, xmm7
- pshufd xmm2, xmm2, R_SHUFFLE_D( 0, 1, 0, 1 )
- movdqa color0, xmm2
- // color2 = ( color0 + color1 ) >> 1
- movdqa xmm6, xmm0
- paddw xmm6, xmm1
- psrlw xmm6, 1
- packuswb xmm6, xmm7
- pshufd xmm6, xmm6, R_SHUFFLE_D( 0, 1, 0, 1 )
- movdqa color2, xmm6
- movdqa xmm3, xmm1
- packuswb xmm3, xmm7
- pshufd xmm3, xmm3, R_SHUFFLE_D( 0, 1, 0, 1 )
- movdqa color1, xmm3
- movdqa color3, xmm7
- mov eax, 32
- mov esi, colorBlock
- loop1: // iterates 2 times
- movq xmm3, qword ptr [esi+eax+0]
- pshufd xmm3, xmm3, R_SHUFFLE_D( 0, 2, 1, 3 )
- movq xmm5, qword ptr [esi+eax+8]
- pshufd xmm5, xmm5, R_SHUFFLE_D( 0, 2, 1, 3 )
- movdqa xmm0, xmm3
- movdqa xmm6, xmm5
- psadbw xmm0, color0
- psadbw xmm6, color0
- packssdw xmm0, xmm6
- movdqa xmm1, xmm3
- movdqa xmm6, xmm5
- psadbw xmm1, color1
- psadbw xmm6, color1
- packssdw xmm1, xmm6
- movdqa xmm2, xmm3
- movdqa xmm6, xmm5
- psadbw xmm2, color2
- psadbw xmm6, color2
- packssdw xmm2, xmm6
- // gather the top byte of the four pixels just loaded
- shufps xmm3, xmm5, R_SHUFFLE_D( 0, 2, 0, 2 )
- psrld xmm3, 24
- packssdw xmm3, xmm3
- movq xmm4, qword ptr [esi+eax+16]
- pshufd xmm4, xmm4, R_SHUFFLE_D( 0, 2, 1, 3 )
- movq xmm5, qword ptr [esi+eax+24]
- pshufd xmm5, xmm5, R_SHUFFLE_D( 0, 2, 1, 3 )
- movdqa xmm6, xmm4
- movdqa xmm7, xmm5
- psadbw xmm6, color0
- psadbw xmm7, color0
- packssdw xmm6, xmm7
- packssdw xmm0, xmm6 // d0
- movdqa xmm6, xmm4
- movdqa xmm7, xmm5
- psadbw xmm6, color1
- psadbw xmm7, color1
- packssdw xmm6, xmm7
- packssdw xmm1, xmm6 // d1
- movdqa xmm6, xmm4
- movdqa xmm7, xmm5
- psadbw xmm6, color2
- psadbw xmm7, color2
- packssdw xmm6, xmm7
- packssdw xmm2, xmm6 // d2
- shufps xmm4, xmm5, R_SHUFFLE_D( 0, 2, 0, 2 )
- psrld xmm4, 24
- packssdw xmm4, xmm4
- punpcklqdq xmm3, xmm4 // c3
- movdqa xmm7, result
- pslld xmm7, 16
- movdqa xmm4, xmm2
- pcmpgtw xmm2, xmm0 // b0
- pcmpgtw xmm4, xmm1 // b1
- pcmpgtw xmm1, xmm0 // b2
- pmaxsw xmm3, SIMD_SSE2_word_127 // b3
- pcmpeqw xmm3, SIMD_SSE2_word_127
- pand xmm2, xmm4
- por xmm2, xmm3 // b0 & b1 | b3
- pxor xmm1, xmm4
- por xmm1, xmm3 // b2 ^ b1 | b3
- pand xmm2, SIMD_SSE2_word_2
- pand xmm1, SIMD_SSE2_word_1
- por xmm2, xmm1
- pshufd xmm5, xmm2, R_SHUFFLE_D( 2, 3, 0, 1 )
- punpcklwd xmm2, SIMD_SSE2_word_0
- punpcklwd xmm5, SIMD_SSE2_word_0
- pslld xmm5, 8
- por xmm7, xmm5
- por xmm7, xmm2
- movdqa result, xmm7
- sub eax, 32
- jge loop1
- // fold the four dwords of 2-bit indices into the low dword
- mov esi, outPtr
- pshufd xmm4, xmm7, R_SHUFFLE_D( 1, 2, 3, 0 )
- pshufd xmm5, xmm7, R_SHUFFLE_D( 2, 3, 0, 1 )
- pshufd xmm6, xmm7, R_SHUFFLE_D( 3, 0, 1, 2 )
- pslld xmm4, 2
- pslld xmm5, 4
- pslld xmm6, 6
- por xmm7, xmm4
- por xmm7, xmm5
- por xmm7, xmm6
- movd dword ptr [esi], xmm7
- }
- outData += 4;
- #elif defined ( ID_WIN_X86_SSE2_INTRIN )
- __m128c zero = SIMD_SSE2_zero;
- __m128c result = SIMD_SSE2_zero;
- __m128c color0, color1, color2;
- __m128c temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
- __m128c minColor = _mm_cvtsi32_si128( *(int *)minColor_ );
- __m128c maxColor = _mm_cvtsi32_si128( *(int *)maxColor_ );
- // the block is read through __m128i pointers, i.e. as four 16 byte loads
- __m128c blocka[2], blockb[2];
- blocka[0] = *((__m128i *)(&colorBlock[ 0]));
- blocka[1] = *((__m128i *)(&colorBlock[32]));
- blockb[0] = *((__m128i *)(&colorBlock[16]));
- blockb[1] = *((__m128i *)(&colorBlock[48]));
- // temp0 = masked max color with the high bits of each channel copied into its low bits
- temp0 = _mm_and_si128( maxColor, *(__m128c*)SIMD_SSE2_byte_colorMask );
- temp0 = _mm_unpacklo_epi8( temp0, zero );
- temp4 = _mm_shufflelo_epi16( temp0, R_SHUFFLE_D( 0, 3, 2, 3 ) );
- temp5 = _mm_shufflelo_epi16( temp0, R_SHUFFLE_D( 3, 1, 3, 3 ) );
- temp4 = _mm_srli_epi16( temp4, 5 );
- temp5 = _mm_srli_epi16( temp5, 6 );
- temp0 = _mm_or_si128( temp0, temp4 );
- temp0 = _mm_or_si128( temp0, temp5 );
- // temp1 = same expansion applied to the min color
- temp1 = _mm_and_si128( minColor, *(__m128c*)SIMD_SSE2_byte_colorMask );
- temp1 = _mm_unpacklo_epi8( temp1, zero );
- temp4 = _mm_shufflelo_epi16( temp1, R_SHUFFLE_D( 0, 3, 2, 3 ) );
- temp5 = _mm_shufflelo_epi16( temp1, R_SHUFFLE_D( 3, 1, 3, 3 ) );
- temp4 = _mm_srli_epi16( temp4, 5 );
- temp5 = _mm_srli_epi16( temp5, 6 );
- temp1 = _mm_or_si128( temp1, temp4 );
- temp1 = _mm_or_si128( temp1, temp5 );
- temp2 = _mm_packus_epi16( temp0, zero );
- color0 = _mm_shuffle_epi32( temp2, R_SHUFFLE_D( 0, 1, 0, 1 ) );
- // color2 = ( color0 + color1 ) >> 1, the same as 'paddw xmm6, xmm1' / 'psrlw xmm6, 1' in the asm path.
- // Adding temp0 to itself here would make color2 a copy of color0.
- temp6 = _mm_add_epi16( temp0, temp1 );
- temp6 = _mm_srli_epi16( temp6, 1 ); // diff from color
- temp6 = _mm_packus_epi16( temp6, zero );
- color2 = _mm_shuffle_epi32( temp6, R_SHUFFLE_D( 0, 1, 0, 1 ) );
- temp3 = _mm_packus_epi16( temp1, zero );
- color1 = _mm_shuffle_epi32( temp3, R_SHUFFLE_D( 0, 1, 0, 1 ) );
- // not used
- //color3 = zero;
- for ( int i = 1; i >= 0; i-- ) {
- // Load block
- temp3 = _mm_shuffle_epi32( blocka[i], R_SHUFFLE_D( 0, 2, 1, 3 ) );
- temp5 = _mm_shuffle_ps( blocka[i], zero, R_SHUFFLE_D( 2, 3, 0, 1 ) );
- temp5 = _mm_shuffle_epi32( temp5, R_SHUFFLE_D( 0, 2, 1, 3 ) );
- temp0 = _mm_sad_epu8( temp3, color0 );
- temp6 = _mm_sad_epu8( temp5, color0 );
- temp0 = _mm_packs_epi32( temp0, temp6 );
- temp1 = _mm_sad_epu8( temp3, color1 );
- temp6 = _mm_sad_epu8( temp5, color1 );
- temp1 = _mm_packs_epi32( temp1, temp6 );
- temp2 = _mm_sad_epu8( temp3, color2 );
- temp6 = _mm_sad_epu8( temp5, color2 );
- temp2 = _mm_packs_epi32( temp2, temp6 );
- // diff from color
- // gather the top byte of the four pixels of blocka[i]
- temp3 = _mm_shuffle_ps( temp3, temp5, R_SHUFFLE_D( 0, 2, 0, 2 ) );
- temp3 = _mm_srli_epi32( temp3, 24 );
- temp3 = _mm_packs_epi32( temp3, temp3 );
- // Load block
- temp4 = _mm_shuffle_epi32( blockb[i], R_SHUFFLE_D( 0, 2, 1, 3 ) );
- temp5 = _mm_shuffle_ps( blockb[i], zero, R_SHUFFLE_D( 2, 3, 0, 1 ) );
- temp5 = _mm_shuffle_epi32( temp5, R_SHUFFLE_D( 0, 2, 1, 3 ) );
- temp6 = _mm_sad_epu8( temp4, color0 );
- temp7 = _mm_sad_epu8( temp5, color0 );
- temp6 = _mm_packs_epi32( temp6, temp7 );
- temp0 = _mm_packs_epi32( temp0, temp6 ); // d0
- temp6 = _mm_sad_epu8( temp4, color1 );
- temp7 = _mm_sad_epu8( temp5, color1 );
- temp6 = _mm_packs_epi32( temp6, temp7 );
- temp1 = _mm_packs_epi32( temp1, temp6 ); // d1
- temp6 = _mm_sad_epu8( temp4, color2 );
- temp7 = _mm_sad_epu8( temp5, color2 );
- temp6 = _mm_packs_epi32( temp6, temp7 );
- temp2 = _mm_packs_epi32( temp2, temp6 ); // d2
- // diff from color
- temp4 = _mm_shuffle_ps( temp4, temp5, R_SHUFFLE_D( 0, 2, 0, 2 ) ); // c3
- temp4 = _mm_srli_epi32( temp4, 24 );
- temp4 = _mm_packs_epi32( temp4, temp4 );
- temp3 = _mm_unpacklo_epi64( temp3, temp4 );
- temp7 = _mm_slli_epi32( result, 16 );
- // diff from color
- temp4 = _mm_cmpgt_epi16( temp2, temp1 ); // b1
- temp2 = _mm_cmpgt_epi16( temp2, temp0 ); // b0
- temp1 = _mm_cmpgt_epi16( temp1, temp0 ); // b2
- temp3 = _mm_max_epi16( temp3, (const __m128i &)SIMD_SSE2_word_127 ); // b3
- temp3 = _mm_cmpeq_epi16( temp3, (const __m128i &)SIMD_SSE2_word_127 );
- temp2 = _mm_and_si128( temp2, temp4 );
- temp2 = _mm_or_si128( temp2, temp3 ); // b0 & b1 | b3
- temp1 = _mm_xor_si128( temp1, temp4 );
- temp1 = _mm_or_si128( temp1, temp3 ); // b2 ^ b1 | b3
- temp2 = _mm_and_si128( temp2, (const __m128i &)SIMD_SSE2_word_2 );
- temp1 = _mm_and_si128( temp1, (const __m128i &)SIMD_SSE2_word_1 );
- temp2 = _mm_or_si128( temp2, temp1 );
- temp5 = _mm_shuffle_epi32( temp2, R_SHUFFLE_D( 2, 3, 0, 1 ) );
- temp2 = _mm_unpacklo_epi16( temp2, (const __m128i &)SIMD_SSE2_word_0 );
- temp5 = _mm_unpacklo_epi16( temp5, (const __m128i &)SIMD_SSE2_word_0 );
- temp5 = _mm_slli_epi32( temp5, 8 );
- temp7 = _mm_or_si128( temp7, temp5 );
- result = _mm_or_si128( temp7, temp2 );
- }
- // fold the four dwords of 2-bit indices into the low dword
- temp4 = _mm_shuffle_epi32( result, R_SHUFFLE_D( 1, 2, 3, 0 ) );
- temp5 = _mm_shuffle_epi32( result, R_SHUFFLE_D( 2, 3, 0, 1 ) );
- temp6 = _mm_shuffle_epi32( result, R_SHUFFLE_D( 3, 0, 1, 2 ) );
- temp4 = _mm_slli_epi32( temp4, 2 );
- temp5 = _mm_slli_epi32( temp5, 4 );
- temp6 = _mm_slli_epi32( temp6, 6 );
- temp7 = _mm_or_si128( result, temp4 );
- temp7 = _mm_or_si128( temp7, temp5 );
- temp7 = _mm_or_si128( temp7, temp6 );
- unsigned int out = _mm_cvtsi128_si32( temp7 );
- EmitUInt( out );
- #else
- assert( false );
- #endif
- }
- /*
- ========================
- idDxtEncoder::EmitCoCgIndices_SSE2
- Emits the 2-bit color indices of a 16 pixel block against a four color palette:
- max, min, ( 2 * max + min ) / 3 and ( max + 2 * min ) / 3. Both end point colors
- are masked with SIMD_SSE2_byte_colorMask2 before the palette is built.
- param: colorBlock - 16 pixel block (64 bytes are read) for which to find color indices
- param: minColor_ - Min color found (4 bytes are read)
- param: maxColor_ - Max color found (4 bytes are read)
- output: 4 byte color index block (stored at outData in the asm path, passed to EmitUInt otherwise)
- ========================
- */
- void idDxtEncoder::EmitCoCgIndices_SSE2( const byte *colorBlock, const byte *minColor_, const byte *maxColor_ ) {
- #if ( defined( ID_WIN_X86_ASM ) || defined( ID_MAC_X86_ASM ) )
- ALIGN16( byte color0[16] );
- ALIGN16( byte color1[16] );
- ALIGN16( byte color2[16] );
- ALIGN16( byte color3[16] );
- ALIGN16( byte result[16] );
- byte *outPtr = outData;
- __asm {
- mov esi, maxColor_
- mov edi, minColor_
- pxor xmm7, xmm7
- movdqa result, xmm7
- movd xmm0, dword ptr [esi]
- pand xmm0, SIMD_SSE2_byte_colorMask2
- pshufd xmm0, xmm0, R_SHUFFLE_D( 0, 1, 0, 1 )
- movdqa color0, xmm0 // color0 = masked max color
- movd xmm1, dword ptr [edi]
- pand xmm1, SIMD_SSE2_byte_colorMask2
- pshufd xmm1, xmm1, R_SHUFFLE_D( 0, 1, 0, 1 )
- movdqa color1, xmm1 // color1 = masked min color
- punpcklbw xmm0, xmm7
- punpcklbw xmm1, xmm7
- movdqa xmm6, xmm1 // keep min for the color3 sum below
- paddw xmm1, xmm0 // max + min
- paddw xmm0, xmm1 // 2 * max + min
- pmulhw xmm0, SIMD_SSE2_word_div_by_3 // * ( ( 1 << 16 ) / 3 + 1 ) ) >> 16
- packuswb xmm0, xmm7
- pshufd xmm0, xmm0, R_SHUFFLE_D( 0, 1, 0, 1 )
- movdqa color2, xmm0
- paddw xmm1, xmm6 // max + 2 * min
- pmulhw xmm1, SIMD_SSE2_word_div_by_3 // * ( ( 1 << 16 ) / 3 + 1 ) ) >> 16
- packuswb xmm1, xmm7
- pshufd xmm1, xmm1, R_SHUFFLE_D( 0, 1, 0, 1 )
- movdqa color3, xmm1
- mov eax, 32
- mov esi, colorBlock
- loop1: // iterates 2 times
- movq xmm3, qword ptr [esi+eax+0]
- pshufd xmm3, xmm3, R_SHUFFLE_D( 0, 2, 1, 3 ) // alternative: punpckldq xmm3, SIMD_SSE2_dword_0
- movq xmm5, qword ptr [esi+eax+8]
- pshufd xmm5, xmm5, R_SHUFFLE_D( 0, 2, 1, 3 ) // alternative: punpckldq xmm5, SIMD_SSE2_dword_0
- movdqa xmm0, xmm3
- movdqa xmm6, xmm5
- psadbw xmm0, color0
- psadbw xmm6, color0
- packssdw xmm0, xmm6
- movdqa xmm1, xmm3
- movdqa xmm6, xmm5
- psadbw xmm1, color1
- psadbw xmm6, color1
- packssdw xmm1, xmm6
- movdqa xmm2, xmm3
- movdqa xmm6, xmm5
- psadbw xmm2, color2
- psadbw xmm6, color2
- packssdw xmm2, xmm6
- psadbw xmm3, color3
- psadbw xmm5, color3
- packssdw xmm3, xmm5
- movq xmm4, qword ptr [esi+eax+16]
- pshufd xmm4, xmm4, R_SHUFFLE_D( 0, 2, 1, 3 )
- movq xmm5, qword ptr [esi+eax+24]
- pshufd xmm5, xmm5, R_SHUFFLE_D( 0, 2, 1, 3 )
- movdqa xmm6, xmm4
- movdqa xmm7, xmm5
- psadbw xmm6, color0
- psadbw xmm7, color0
- packssdw xmm6, xmm7
- packssdw xmm0, xmm6 // d0
- movdqa xmm6, xmm4
- movdqa xmm7, xmm5
- psadbw xmm6, color1
- psadbw xmm7, color1
- packssdw xmm6, xmm7
- packssdw xmm1, xmm6 // d1
- movdqa xmm6, xmm4
- movdqa xmm7, xmm5
- psadbw xmm6, color2
- psadbw xmm7, color2
- packssdw xmm6, xmm7
- packssdw xmm2, xmm6 // d2
- psadbw xmm4, color3
- psadbw xmm5, color3
- packssdw xmm4, xmm5
- packssdw xmm3, xmm4 // d3
- movdqa xmm7, result
- pslld xmm7, 16 // make room for the indices of this iteration
- movdqa xmm4, xmm0
- movdqa xmm5, xmm1
- pcmpgtw xmm0, xmm3 // b0 = d0 > d3
- pcmpgtw xmm1, xmm2 // b1 = d1 > d2
- pcmpgtw xmm4, xmm2 // b2 = d0 > d2
- pcmpgtw xmm5, xmm3 // b3 = d1 > d3
- pcmpgtw xmm2, xmm3 // b4 = d2 > d3
- pand xmm4, xmm1 // x0 = b2 & b1
- pand xmm5, xmm0 // x1 = b3 & b0
- pand xmm2, xmm0 // x2 = b4 & b0
- por xmm4, xmm5
- pand xmm2, SIMD_SSE2_word_1 // x2 selects index bit 0
- pand xmm4, SIMD_SSE2_word_2 // x0 | x1 selects index bit 1
- por xmm2, xmm4
- pshufd xmm5, xmm2, R_SHUFFLE_D( 2, 3, 0, 1 )
- punpcklwd xmm2, SIMD_SSE2_word_0
- punpcklwd xmm5, SIMD_SSE2_word_0
- pslld xmm5, 8
- por xmm7, xmm5
- por xmm7, xmm2
- movdqa result, xmm7
- sub eax, 32
- jge loop1
- mov esi, outPtr
- pshufd xmm4, xmm7, R_SHUFFLE_D( 1, 2, 3, 0 )
- pshufd xmm5, xmm7, R_SHUFFLE_D( 2, 3, 0, 1 )
- pshufd xmm6, xmm7, R_SHUFFLE_D( 3, 0, 1, 2 )
- pslld xmm4, 2
- pslld xmm5, 4
- pslld xmm6, 6
- por xmm7, xmm4
- por xmm7, xmm5
- por xmm7, xmm6 // low dword now holds all sixteen 2-bit indices
- movd dword ptr [esi], xmm7
- }
- outData += 4;
- #elif defined ( ID_WIN_X86_SSE2_INTRIN )
- __m128c zero = SIMD_SSE2_zero;
- __m128c result = SIMD_SSE2_zero;
- __m128c color0, color1, color2, color3;
- __m128c temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
- __m128c minColor = _mm_cvtsi32_si128( *(int *)minColor_ );
- __m128c maxColor = _mm_cvtsi32_si128( *(int *)maxColor_ );
- __m128c blocka[2], blockb[2];
- blocka[0] = *((__m128i *)(&colorBlock[ 0])); // the block is read as four 16 byte loads through __m128i pointers
- blocka[1] = *((__m128i *)(&colorBlock[32]));
- blockb[0] = *((__m128i *)(&colorBlock[16]));
- blockb[1] = *((__m128i *)(&colorBlock[48]));
- temp7 = zero; // counterpart of 'pxor xmm7, xmm7' in the asm path; overwritten before it is read
- temp0 = maxColor;
- temp0 = _mm_and_si128( temp0, *(__m128c*)SIMD_SSE2_byte_colorMask2 );
- color0 = _mm_shuffle_epi32( temp0, R_SHUFFLE_D( 0, 1, 0, 1 ) ); // color0 = masked max color
- temp1 = minColor;
- temp1 = _mm_and_si128( temp1, *(__m128c*)SIMD_SSE2_byte_colorMask2 );
- color1 = _mm_shuffle_epi32( temp1, R_SHUFFLE_D( 0, 1, 0, 1 ) ); // color1 = masked min color
- temp0 = _mm_unpacklo_epi8( color0, zero );
- temp1 = _mm_unpacklo_epi8( color1, zero );
- temp6 = _mm_add_epi16( temp1, temp0 ); // max + min
- temp0 = _mm_add_epi16( temp0, temp6 ); // 2 * max + min
- temp0 = _mm_mulhi_epi16( temp0, (const __m128i &)SIMD_SSE2_word_div_by_3 ); // * ( ( 1 << 16 ) / 3 + 1 ) ) >> 16
- temp0 = _mm_packus_epi16( temp0, zero );
- color2 = _mm_shuffle_epi32( temp0, R_SHUFFLE_D( 0, 1, 0, 1 ) );
- temp1 = _mm_add_epi16( temp1, temp6 ); // max + 2 * min
- temp1 = _mm_mulhi_epi16( temp1, (const __m128i &)SIMD_SSE2_word_div_by_3 ); // * ( ( 1 << 16 ) / 3 + 1 ) ) >> 16
- temp1 = _mm_packus_epi16( temp1, zero );
- color3 = _mm_shuffle_epi32( temp1, R_SHUFFLE_D( 0, 1, 0, 1 ) );
- for ( int i = 1; i >= 0; i-- ) {
- // Load the first 16 bytes of this half of the block
- temp3 = _mm_shuffle_epi32( blocka[i], R_SHUFFLE_D( 0, 2, 1, 3 ) );
- temp5 = _mm_shuffle_ps( blocka[i], zero, R_SHUFFLE_D( 2, 3, 0, 1 ) );
- temp5 = _mm_shuffle_epi32( temp5, R_SHUFFLE_D( 0, 2, 1, 3 ) );
- temp0 = _mm_sad_epu8( temp3, color0 );
- temp6 = _mm_sad_epu8( temp5, color0 );
- temp0 = _mm_packs_epi32( temp0, temp6 );
- temp1 = _mm_sad_epu8( temp3, color1 );
- temp6 = _mm_sad_epu8( temp5, color1 );
- temp1 = _mm_packs_epi32( temp1, temp6 );
- temp2 = _mm_sad_epu8( temp3, color2 );
- temp6 = _mm_sad_epu8( temp5, color2 );
- temp2 = _mm_packs_epi32( temp2, temp6 );
- temp3 = _mm_sad_epu8( temp3, color3 );
- temp5 = _mm_sad_epu8( temp5, color3 );
- temp3 = _mm_packs_epi32( temp3, temp5 );
- // Load the second 16 bytes of this half of the block
- temp4 = _mm_shuffle_epi32( blockb[i], R_SHUFFLE_D( 0, 2, 1, 3 ) );
- temp5 = _mm_shuffle_ps( blockb[i], zero, R_SHUFFLE_D( 2, 3, 0, 1 ) );
- temp5 = _mm_shuffle_epi32( temp5, R_SHUFFLE_D( 0, 2, 1, 3 ) );
- temp6 = _mm_sad_epu8( temp4, color0 );
- temp7 = _mm_sad_epu8( temp5, color0 );
- temp6 = _mm_packs_epi32( temp6, temp7 );
- temp0 = _mm_packs_epi32( temp0, temp6 ); // d0
- temp6 = _mm_sad_epu8( temp4, color1 );
- temp7 = _mm_sad_epu8( temp5, color1 );
- temp6 = _mm_packs_epi32( temp6, temp7 );
- temp1 = _mm_packs_epi32( temp1, temp6 ); // d1
- temp6 = _mm_sad_epu8( temp4, color2 );
- temp7 = _mm_sad_epu8( temp5, color2 );
- temp6 = _mm_packs_epi32( temp6, temp7 );
- temp2 = _mm_packs_epi32( temp2, temp6 ); // d2
- temp4 = _mm_sad_epu8( temp4, color3 );
- temp5 = _mm_sad_epu8( temp5, color3 );
- temp4 = _mm_packs_epi32( temp4, temp5 );
- temp3 = _mm_packs_epi32( temp3, temp4 ); // d3
- temp7 = _mm_slli_epi32( result, 16 ); // make room for the indices of this iteration
- temp4 = _mm_cmpgt_epi16( temp0, temp2 ); // b2 = d0 > d2
- temp5 = _mm_cmpgt_epi16( temp1, temp3 ); // b3 = d1 > d3
- temp0 = _mm_cmpgt_epi16( temp0, temp3 ); // b0 = d0 > d3
- temp1 = _mm_cmpgt_epi16( temp1, temp2 ); // b1 = d1 > d2
- temp2 = _mm_cmpgt_epi16( temp2, temp3 ); // b4 = d2 > d3
- temp4 = _mm_and_si128( temp4, temp1 ); // x0 = b2 & b1
- temp5 = _mm_and_si128( temp5, temp0 ); // x1 = b3 & b0
- temp2 = _mm_and_si128( temp2, temp0 ); // x2 = b4 & b0
- temp4 = _mm_or_si128( temp4, temp5 );
- temp2 = _mm_and_si128( temp2, (const __m128i &)SIMD_SSE2_word_1 ); // x2 selects index bit 0
- temp4 = _mm_and_si128( temp4, (const __m128i &)SIMD_SSE2_word_2 ); // x0 | x1 selects index bit 1
- temp2 = _mm_or_si128( temp2, temp4 );
- temp5 = _mm_shuffle_epi32( temp2, R_SHUFFLE_D( 2, 3, 0, 1 ) );
- temp2 = _mm_unpacklo_epi16( temp2, (const __m128i &)SIMD_SSE2_word_0 );
- temp5 = _mm_unpacklo_epi16( temp5, (const __m128i &)SIMD_SSE2_word_0 );
- temp5 = _mm_slli_epi32( temp5, 8 );
- temp7 = _mm_or_si128( temp7, temp5 );
- result = _mm_or_si128( temp7, temp2 );
- }
- temp4 = _mm_shuffle_epi32( result, R_SHUFFLE_D( 1, 2, 3, 0 ) );
- temp5 = _mm_shuffle_epi32( result, R_SHUFFLE_D( 2, 3, 0, 1 ) );
- temp6 = _mm_shuffle_epi32( result, R_SHUFFLE_D( 3, 0, 1, 2 ) );
- temp4 = _mm_slli_epi32( temp4, 2 );
- temp5 = _mm_slli_epi32( temp5, 4 );
- temp6 = _mm_slli_epi32( temp6, 6 );
- temp7 = _mm_or_si128( result, temp4 );
- temp7 = _mm_or_si128( temp7, temp5 );
- temp7 = _mm_or_si128( temp7, temp6 ); // low dword now holds all sixteen 2-bit indices
- unsigned int out = _mm_cvtsi128_si32( temp7 );
- EmitUInt( out );
- #else
- assert( false ); // no implementation is compiled in for this configuration
- #endif
- }
- /*
- ========================
- idDxtEncoder::EmitAlphaIndices_SSE2
- Emits the 3-bit alpha indices of a 16 pixel block. The alpha of a pixel is the top
- byte of its dword ( dword >> 24 ). Each alpha is compared against seven thresholds
- interpolated between minAlpha_ and maxAlpha_; the number of thresholds it reaches
- determines its index.
- param: block - 16 pixel block (64 bytes are read as four 16 byte loads) for which to find alpha indices
- param: minAlpha_ - Min alpha found
- param: maxAlpha_ - Max alpha found
- output: 6 bytes of alpha indices. The second 4 byte store goes to offset 3, so the byte
- at offset 6 is written as well.
- ========================
- */
- void idDxtEncoder::EmitAlphaIndices_SSE2( const byte *block, const int minAlpha_, const int maxAlpha_ ) {
- #if ( defined( ID_WIN_X86_ASM ) || defined( ID_MAC_X86_ASM ) )
- assert( maxAlpha_ >= minAlpha_ );
- byte *outPtr = outData;
- __asm {
- mov esi, block
- movdqa xmm0, xmmword ptr [esi+ 0]
- movdqa xmm5, xmmword ptr [esi+16]
- movdqa xmm6, xmmword ptr [esi+32]
- movdqa xmm4, xmmword ptr [esi+48]
- psrld xmm0, 24 // keep only the top byte of every pixel
- psrld xmm5, 24
- psrld xmm6, 24
- psrld xmm4, 24
- packuswb xmm0, xmm5
- packuswb xmm6, xmm4
- //---------------------
- // ab0 = ( 7 * maxAlpha + 7 * minAlpha + ALPHA_RANGE ) / 14
- // ab3 = ( 9 * maxAlpha + 5 * minAlpha + ALPHA_RANGE ) / 14
- // ab2 = ( 11 * maxAlpha + 3 * minAlpha + ALPHA_RANGE ) / 14
- // ab1 = ( 13 * maxAlpha + 1 * minAlpha + ALPHA_RANGE ) / 14
- // ab4 = ( 7 * maxAlpha + 7 * minAlpha + ALPHA_RANGE ) / 14
- // ab5 = ( 5 * maxAlpha + 9 * minAlpha + ALPHA_RANGE ) / 14
- // ab6 = ( 3 * maxAlpha + 11 * minAlpha + ALPHA_RANGE ) / 14
- // ab7 = ( 1 * maxAlpha + 13 * minAlpha + ALPHA_RANGE ) / 14
- movd xmm5, maxAlpha_
- pshuflw xmm5, xmm5, R_SHUFFLE_D( 0, 0, 0, 0 )
- pshufd xmm5, xmm5, R_SHUFFLE_D( 0, 0, 0, 0 ) // maxAlpha in all eight words
- movdqa xmm7, xmm5
- movd xmm2, minAlpha_
- pshuflw xmm2, xmm2, R_SHUFFLE_D( 0, 0, 0, 0 )
- pshufd xmm2, xmm2, R_SHUFFLE_D( 0, 0, 0, 0 ) // minAlpha in all eight words
- movdqa xmm3, xmm2
- pmullw xmm5, SIMD_SSE2_word_scale_7_9_11_13
- pmullw xmm7, SIMD_SSE2_word_scale_7_5_3_1
- pmullw xmm2, SIMD_SSE2_word_scale_7_5_3_1
- pmullw xmm3, SIMD_SSE2_word_scale_7_9_11_13
- paddw xmm5, xmm2 // max * { 7, 9, 11, 13 } + min * { 7, 5, 3, 1 }
- paddw xmm7, xmm3 // max * { 7, 5, 3, 1 } + min * { 7, 9, 11, 13 }
- paddw xmm5, SIMD_SSE2_word_7 // presumably the ALPHA_RANGE term of the formulas above
- paddw xmm7, SIMD_SSE2_word_7
- pmulhw xmm5, SIMD_SSE2_word_div_by_14 // * ( ( 1 << 16 ) / 14 + 1 ) ) >> 16
- pmulhw xmm7, SIMD_SSE2_word_div_by_14 // * ( ( 1 << 16 ) / 14 + 1 ) ) >> 16
- pshufd xmm1, xmm5, R_SHUFFLE_D( 3, 3, 3, 3 )
- pshufd xmm2, xmm5, R_SHUFFLE_D( 2, 2, 2, 2 )
- pshufd xmm3, xmm5, R_SHUFFLE_D( 1, 1, 1, 1 )
- packuswb xmm1, xmm1 // ab1
- packuswb xmm2, xmm2 // ab2
- packuswb xmm3, xmm3 // ab3
- packuswb xmm0, xmm6 // alpha block
- pshufd xmm4, xmm7, R_SHUFFLE_D( 0, 0, 0, 0 )
- pshufd xmm5, xmm7, R_SHUFFLE_D( 1, 1, 1, 1 )
- pshufd xmm6, xmm7, R_SHUFFLE_D( 2, 2, 2, 2 )
- pshufd xmm7, xmm7, R_SHUFFLE_D( 3, 3, 3, 3 )
- packuswb xmm4, xmm4 // ab4
- packuswb xmm5, xmm5 // ab5
- packuswb xmm6, xmm6 // ab6
- packuswb xmm7, xmm7 // ab7
- pmaxub xmm1, xmm0 // max( abN, alpha ) == alpha  <=>  alpha >= abN
- pmaxub xmm2, xmm0
- pmaxub xmm3, xmm0
- pcmpeqb xmm1, xmm0 // each compare leaves 0xFF ( -1 ) where alpha >= abN
- pcmpeqb xmm2, xmm0
- pcmpeqb xmm3, xmm0
- pmaxub xmm4, xmm0
- pmaxub xmm5, xmm0
- pmaxub xmm6, xmm0
- pmaxub xmm7, xmm0
- pcmpeqb xmm4, xmm0
- pcmpeqb xmm5, xmm0
- pcmpeqb xmm6, xmm0
- pcmpeqb xmm7, xmm0
- movdqa xmm0, SIMD_SSE2_byte_8
- paddsb xmm0, xmm1 // SIMD_SSE2_byte_8 plus seven masks of 0 / -1: each threshold reached lowers the value by one
- paddsb xmm2, xmm3
- paddsb xmm4, xmm5
- paddsb xmm6, xmm7
- paddsb xmm0, xmm2
- paddsb xmm4, xmm6
- paddsb xmm0, xmm4
- pand xmm0, SIMD_SSE2_byte_7
- movdqa xmm1, SIMD_SSE2_byte_2
- pcmpgtb xmm1, xmm0 // mask of the bytes below SIMD_SSE2_byte_2
- pand xmm1, SIMD_SSE2_byte_1
- pxor xmm0, xmm1 // flip bit 0 of those bytes
- movdqa xmm1, xmm0
- movdqa xmm2, xmm0
- movdqa xmm3, xmm0
- movdqa xmm4, xmm0
- movdqa xmm5, xmm0
- movdqa xmm6, xmm0
- movdqa xmm7, xmm0
- psrlq xmm1, 8- 3 // move byte k of each qword down to bit 3 * k
- psrlq xmm2, 16- 6
- psrlq xmm3, 24- 9
- psrlq xmm4, 32-12
- psrlq xmm5, 40-15
- psrlq xmm6, 48-18
- psrlq xmm7, 56-21
- pand xmm0, SIMD_SSE2_dword_alpha_bit_mask0
- pand xmm1, SIMD_SSE2_dword_alpha_bit_mask1
- pand xmm2, SIMD_SSE2_dword_alpha_bit_mask2
- pand xmm3, SIMD_SSE2_dword_alpha_bit_mask3
- pand xmm4, SIMD_SSE2_dword_alpha_bit_mask4
- pand xmm5, SIMD_SSE2_dword_alpha_bit_mask5
- pand xmm6, SIMD_SSE2_dword_alpha_bit_mask6
- pand xmm7, SIMD_SSE2_dword_alpha_bit_mask7
- por xmm0, xmm1
- por xmm2, xmm3
- por xmm4, xmm5
- por xmm6, xmm7
- por xmm0, xmm2
- por xmm4, xmm6
- por xmm0, xmm4
- mov esi, outPtr
- movd [esi+0], xmm0 // indices packed from the low qword
- pshufd xmm1, xmm0, R_SHUFFLE_D( 2, 3, 0, 1 )
- movd [esi+3], xmm1 // indices packed from the high qword; this 4 byte store also touches offset 6
- }
- outData += 6;
- #elif defined ( ID_WIN_X86_SSE2_INTRIN )
- __m128i block0 = *((__m128i *)(&block[ 0]));
- __m128i block1 = *((__m128i *)(&block[16]));
- __m128i block2 = *((__m128i *)(&block[32]));
- __m128i block3 = *((__m128i *)(&block[48]));
- __m128c temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
- temp0 = _mm_srli_epi32( block0, 24 ); // keep only the top byte of every pixel
- temp5 = _mm_srli_epi32( block1, 24 );
- temp6 = _mm_srli_epi32( block2, 24 );
- temp4 = _mm_srli_epi32( block3, 24 );
- temp0 = _mm_packus_epi16( temp0, temp5 );
- temp6 = _mm_packus_epi16( temp6, temp4 );
- //---------------------
- // ab0 = ( 7 * maxAlpha + 7 * minAlpha + ALPHA_RANGE ) / 14
- // ab3 = ( 9 * maxAlpha + 5 * minAlpha + ALPHA_RANGE ) / 14
- // ab2 = ( 11 * maxAlpha + 3 * minAlpha + ALPHA_RANGE ) / 14
- // ab1 = ( 13 * maxAlpha + 1 * minAlpha + ALPHA_RANGE ) / 14
- // ab4 = ( 7 * maxAlpha + 7 * minAlpha + ALPHA_RANGE ) / 14
- // ab5 = ( 5 * maxAlpha + 9 * minAlpha + ALPHA_RANGE ) / 14
- // ab6 = ( 3 * maxAlpha + 11 * minAlpha + ALPHA_RANGE ) / 14
- // ab7 = ( 1 * maxAlpha + 13 * minAlpha + ALPHA_RANGE ) / 14
- temp5 = _mm_cvtsi32_si128( maxAlpha_ );
- temp5 = _mm_shufflelo_epi16( temp5, R_SHUFFLE_D( 0, 0, 0, 0 ) );
- temp5 = _mm_shuffle_epi32( temp5, R_SHUFFLE_D( 0, 0, 0, 0 ) ); // maxAlpha in all eight words
- temp2 = _mm_cvtsi32_si128( minAlpha_ );
- temp2 = _mm_shufflelo_epi16( temp2, R_SHUFFLE_D( 0, 0, 0, 0 ) );
- temp2 = _mm_shuffle_epi32( temp2, R_SHUFFLE_D( 0, 0, 0, 0 ) ); // minAlpha in all eight words
- temp7 = _mm_mullo_epi16( temp5, (const __m128i &)SIMD_SSE2_word_scale_7_5_3_1 );
- temp5 = _mm_mullo_epi16( temp5, (const __m128i &)SIMD_SSE2_word_scale_7_9_11_13 );
- temp3 = _mm_mullo_epi16( temp2, (const __m128i &)SIMD_SSE2_word_scale_7_9_11_13 );
- temp2 = _mm_mullo_epi16( temp2, (const __m128i &)SIMD_SSE2_word_scale_7_5_3_1 );
- temp5 = _mm_add_epi16( temp5, temp2 ); // max * { 7, 9, 11, 13 } + min * { 7, 5, 3, 1 }
- temp7 = _mm_add_epi16( temp7, temp3 ); // max * { 7, 5, 3, 1 } + min * { 7, 9, 11, 13 }
- temp5 = _mm_add_epi16( temp5, (const __m128i &)SIMD_SSE2_word_7 ); // presumably the ALPHA_RANGE term of the formulas above
- temp7 = _mm_add_epi16( temp7, (const __m128i &)SIMD_SSE2_word_7 );
- temp5 = _mm_mulhi_epi16( temp5, (const __m128i &)SIMD_SSE2_word_div_by_14 ); // * ( ( 1 << 16 ) / 14 + 1 ) ) >> 16
- temp7 = _mm_mulhi_epi16( temp7, (const __m128i &)SIMD_SSE2_word_div_by_14 ); // * ( ( 1 << 16 ) / 14 + 1 ) ) >> 16
- temp1 = _mm_shuffle_epi32( temp5, R_SHUFFLE_D( 3, 3, 3, 3 ) );
- temp2 = _mm_shuffle_epi32( temp5, R_SHUFFLE_D( 2, 2, 2, 2 ) );
- temp3 = _mm_shuffle_epi32( temp5, R_SHUFFLE_D( 1, 1, 1, 1 ) );
- temp1 = _mm_packus_epi16( temp1, temp1 ); // ab1
- temp2 = _mm_packus_epi16( temp2, temp2 ); // ab2
- temp3 = _mm_packus_epi16( temp3, temp3 ); // ab3
- temp0 = _mm_packus_epi16( temp0, temp6 ); // alpha block
- temp4 = _mm_shuffle_epi32( temp7, R_SHUFFLE_D( 0, 0, 0, 0 ) );
- temp5 = _mm_shuffle_epi32( temp7, R_SHUFFLE_D( 1, 1, 1, 1 ) );
- temp6 = _mm_shuffle_epi32( temp7, R_SHUFFLE_D( 2, 2, 2, 2 ) );
- temp7 = _mm_shuffle_epi32( temp7, R_SHUFFLE_D( 3, 3, 3, 3 ) );
- temp4 = _mm_packus_epi16( temp4, temp4 ); // ab4
- temp5 = _mm_packus_epi16( temp5, temp5 ); // ab5
- temp6 = _mm_packus_epi16( temp6, temp6 ); // ab6
- temp7 = _mm_packus_epi16( temp7, temp7 ); // ab7
- temp1 = _mm_max_epu8( temp1, temp0 ); // max( abN, alpha ) == alpha  <=>  alpha >= abN
- temp2 = _mm_max_epu8( temp2, temp0 );
- temp3 = _mm_max_epu8( temp3, temp0 );
- temp1 = _mm_cmpeq_epi8( temp1, temp0 ); // each compare leaves 0xFF ( -1 ) where alpha >= abN
- temp2 = _mm_cmpeq_epi8( temp2, temp0 );
- temp3 = _mm_cmpeq_epi8( temp3, temp0 );
- temp4 = _mm_max_epu8( temp4, temp0 );
- temp5 = _mm_max_epu8( temp5, temp0 );
- temp6 = _mm_max_epu8( temp6, temp0 );
- temp7 = _mm_max_epu8( temp7, temp0 );
- temp4 = _mm_cmpeq_epi8( temp4, temp0 );
- temp5 = _mm_cmpeq_epi8( temp5, temp0 );
- temp6 = _mm_cmpeq_epi8( temp6, temp0 );
- temp7 = _mm_cmpeq_epi8( temp7, temp0 );
- temp0 = _mm_adds_epi8( (const __m128i &)SIMD_SSE2_byte_8, temp1 ); // SIMD_SSE2_byte_8 plus seven masks of 0 / -1: each threshold reached lowers the value by one
- temp2 = _mm_adds_epi8( temp2, temp3 );
- temp4 = _mm_adds_epi8( temp4, temp5 );
- temp6 = _mm_adds_epi8( temp6, temp7 );
- temp0 = _mm_adds_epi8( temp0, temp2 );
- temp4 = _mm_adds_epi8( temp4, temp6 );
- temp0 = _mm_adds_epi8( temp0, temp4 );
- temp0 = _mm_and_si128( temp0, (const __m128i &)SIMD_SSE2_byte_7 );
- temp1 = _mm_cmpgt_epi8( (const __m128i &)SIMD_SSE2_byte_2, temp0 ); // mask of the bytes below SIMD_SSE2_byte_2
- temp1 = _mm_and_si128( temp1, (const __m128i &)SIMD_SSE2_byte_1 );
- temp0 = _mm_xor_si128( temp0, temp1 ); // flip bit 0 of those bytes
- temp1 = _mm_srli_epi64( temp0, 8 - 3 ); // move byte k of each qword down to bit 3 * k
- temp2 = _mm_srli_epi64( temp0, 16 - 6 );
- temp3 = _mm_srli_epi64( temp0, 24 - 9 );
- temp4 = _mm_srli_epi64( temp0, 32 - 12 );
- temp5 = _mm_srli_epi64( temp0, 40 - 15 );
- temp6 = _mm_srli_epi64( temp0, 48 - 18 );
- temp7 = _mm_srli_epi64( temp0, 56 - 21 );
- temp0 = _mm_and_si128( temp0, (const __m128i &)SIMD_SSE2_dword_alpha_bit_mask0 );
- temp1 = _mm_and_si128( temp1, (const __m128i &)SIMD_SSE2_dword_alpha_bit_mask1 );
- temp2 = _mm_and_si128( temp2, (const __m128i &)SIMD_SSE2_dword_alpha_bit_mask2 );
- temp3 = _mm_and_si128( temp3, (const __m128i &)SIMD_SSE2_dword_alpha_bit_mask3 );
- temp4 = _mm_and_si128( temp4, (const __m128i &)SIMD_SSE2_dword_alpha_bit_mask4 );
- temp5 = _mm_and_si128( temp5, (const __m128i &)SIMD_SSE2_dword_alpha_bit_mask5 );
- temp6 = _mm_and_si128( temp6, (const __m128i &)SIMD_SSE2_dword_alpha_bit_mask6 );
- temp7 = _mm_and_si128( temp7, (const __m128i &)SIMD_SSE2_dword_alpha_bit_mask7 );
- temp0 = _mm_or_si128( temp0, temp1 );
- temp2 = _mm_or_si128( temp2, temp3 );
- temp4 = _mm_or_si128( temp4, temp5 );
- temp6 = _mm_or_si128( temp6, temp7 );
- temp0 = _mm_or_si128( temp0, temp2 );
- temp4 = _mm_or_si128( temp4, temp6 );
- temp0 = _mm_or_si128( temp0, temp4 );
- int out = _mm_cvtsi128_si32( temp0 ); // indices packed from the low qword
- EmitUInt( out );
- outData--; // step back one byte so the next emit starts 3 bytes after this one (assumes EmitUInt advances outData by 4 - TODO confirm)
- temp1 = _mm_shuffle_epi32( temp0, R_SHUFFLE_D( 2, 3, 0, 1 ) );
- out = _mm_cvtsi128_si32( temp1 ); // indices packed from the high qword
- EmitUInt( out );
- outData--;
- #else
- assert( false ); // no implementation is compiled in for this configuration
- #endif
- }
- /*
- ========================
- idDxtEncoder::EmitAlphaIndices_SSE2
-
- Derives one 3-bit index per texel for a single 8-bit channel of a 16 texel block
- and appends the resulting 48 index bits (6 bytes) to outData.
- Each texel is compared against seven thresholds interpolated between maxAlpha_ and
- minAlpha_ (the ab1..ab7 formulas listed inside the body); the number of thresholds
- a texel reaches determines its index.
- params: block - 16 texels of 4 bytes each, read with 16-byte aligned loads
- params: channelBitOffset - right shift in bits applied to every 32-bit texel before
-          the low byte is extracted
- params: minAlpha_ - lower end point of the channel
- params: maxAlpha_ - upper end point of the channel; only the inline assembly path
-          asserts maxAlpha_ >= minAlpha_
- NOTE(review): the last 4-byte store starts at byte 3 and therefore also touches
- byte 6, one byte beyond the 6 bytes outData is advanced by.
- ========================
- */
- void idDxtEncoder::EmitAlphaIndices_SSE2( const byte *block, const int channelBitOffset, const int minAlpha_, const int maxAlpha_ ) {
- #if ( defined( ID_WIN_X86_ASM ) || defined( ID_MAC_X86_ASM ) )
- assert( maxAlpha_ >= minAlpha_ );
- byte *outPtr = outData;
- __asm {
- movd xmm7, channelBitOffset // xmm7 = shift count for psrld
- mov esi, block
- movdqa xmm0, xmmword ptr [esi+ 0] // texels 0-3
- movdqa xmm5, xmmword ptr [esi+16] // texels 4-7
- movdqa xmm6, xmmword ptr [esi+32] // texels 8-11
- movdqa xmm4, xmmword ptr [esi+48] // texels 12-15
- psrld xmm0, xmm7 // shift the wanted channel down in every dword
- psrld xmm5, xmm7
- psrld xmm6, xmm7
- psrld xmm4, xmm7
- pand xmm0, SIMD_SSE2_dword_byte_mask // presumably keeps only the low byte of each dword - confirm constant
- pand xmm5, SIMD_SSE2_dword_byte_mask
- pand xmm6, SIMD_SSE2_dword_byte_mask
- pand xmm4, SIMD_SSE2_dword_byte_mask
- packuswb xmm0, xmm5 // first pack: one word per texel, texels 0-7
- packuswb xmm6, xmm4 // first pack: one word per texel, texels 8-15
- //---------------------
- // ab0 = ( 7 * maxAlpha + 7 * minAlpha + ALPHA_RANGE ) / 14
- // ab3 = ( 9 * maxAlpha + 5 * minAlpha + ALPHA_RANGE ) / 14
- // ab2 = ( 11 * maxAlpha + 3 * minAlpha + ALPHA_RANGE ) / 14
- // ab1 = ( 13 * maxAlpha + 1 * minAlpha + ALPHA_RANGE ) / 14
- // ab4 = ( 7 * maxAlpha + 7 * minAlpha + ALPHA_RANGE ) / 14
- // ab5 = ( 5 * maxAlpha + 9 * minAlpha + ALPHA_RANGE ) / 14
- // ab6 = ( 3 * maxAlpha + 11 * minAlpha + ALPHA_RANGE ) / 14
- // ab7 = ( 1 * maxAlpha + 13 * minAlpha + ALPHA_RANGE ) / 14
- movd xmm5, maxAlpha_
- pshuflw xmm5, xmm5, R_SHUFFLE_D( 0, 0, 0, 0 )
- pshufd xmm5, xmm5, R_SHUFFLE_D( 0, 0, 0, 0 ) // maxAlpha_ replicated into all 8 words
- movdqa xmm7, xmm5
- movd xmm2, minAlpha_
- pshuflw xmm2, xmm2, R_SHUFFLE_D( 0, 0, 0, 0 )
- pshufd xmm2, xmm2, R_SHUFFLE_D( 0, 0, 0, 0 ) // minAlpha_ replicated into all 8 words
- movdqa xmm3, xmm2
- pmullw xmm5, SIMD_SSE2_word_scale_7_9_11_13 // max weights for ab1..ab3 (and the 7/7 midpoint)
- pmullw xmm7, SIMD_SSE2_word_scale_7_5_3_1 // max weights for ab4..ab7
- pmullw xmm2, SIMD_SSE2_word_scale_7_5_3_1 // min weights for ab1..ab3
- pmullw xmm3, SIMD_SSE2_word_scale_7_9_11_13 // min weights for ab4..ab7
- paddw xmm5, xmm2
- paddw xmm7, xmm3
- paddw xmm5, SIMD_SSE2_word_7 // rounding term (ALPHA_RANGE in the formulas above)
- paddw xmm7, SIMD_SSE2_word_7
- pmulhw xmm5, SIMD_SSE2_word_div_by_14 // * ( ( 1 << 16 ) / 14 + 1 ) ) >> 16
- pmulhw xmm7, SIMD_SSE2_word_div_by_14 // * ( ( 1 << 16 ) / 14 + 1 ) ) >> 16
- pshufd xmm1, xmm5, R_SHUFFLE_D( 3, 3, 3, 3 ) // broadcast one threshold per register
- pshufd xmm2, xmm5, R_SHUFFLE_D( 2, 2, 2, 2 )
- pshufd xmm3, xmm5, R_SHUFFLE_D( 1, 1, 1, 1 )
- packuswb xmm1, xmm1 // ab1
- packuswb xmm2, xmm2 // ab2
- packuswb xmm3, xmm3 // ab3
- packuswb xmm0, xmm6 // alpha block
- pshufd xmm4, xmm7, R_SHUFFLE_D( 0, 0, 0, 0 )
- pshufd xmm5, xmm7, R_SHUFFLE_D( 1, 1, 1, 1 )
- pshufd xmm6, xmm7, R_SHUFFLE_D( 2, 2, 2, 2 )
- pshufd xmm7, xmm7, R_SHUFFLE_D( 3, 3, 3, 3 )
- packuswb xmm4, xmm4 // ab4
- packuswb xmm5, xmm5 // ab5
- packuswb xmm6, xmm6 // ab6
- packuswb xmm7, xmm7 // ab7
- pmaxub xmm1, xmm0 // max( abN, alpha ) == alpha  <=>  alpha >= abN
- pmaxub xmm2, xmm0
- pmaxub xmm3, xmm0
- pcmpeqb xmm1, xmm0 // 0xFF (-1) in every byte where alpha >= ab1
- pcmpeqb xmm2, xmm0
- pcmpeqb xmm3, xmm0
- pmaxub xmm4, xmm0
- pmaxub xmm5, xmm0
- pmaxub xmm6, xmm0
- pmaxub xmm7, xmm0
- pcmpeqb xmm4, xmm0
- pcmpeqb xmm5, xmm0
- pcmpeqb xmm6, xmm0
- pcmpeqb xmm7, xmm0
- movdqa xmm0, SIMD_SSE2_byte_8
- paddsb xmm0, xmm1 // start value plus the seven 0/-1 masks: start minus the number of thresholds reached
- paddsb xmm2, xmm3
- paddsb xmm4, xmm5
- paddsb xmm6, xmm7
- paddsb xmm0, xmm2
- paddsb xmm4, xmm6
- paddsb xmm0, xmm4
- pand xmm0, SIMD_SSE2_byte_7 // presumably keeps the low 3 bits of every byte - confirm constant
- movdqa xmm1, SIMD_SSE2_byte_2
- pcmpgtb xmm1, xmm0 // bytes where the byte_2 constant is greater than the index
- pand xmm1, SIMD_SSE2_byte_1
- pxor xmm0, xmm1 // toggle the bit(s) given by byte_1 for those bytes (presumably swaps indices 0 and 1)
- movdqa xmm1, xmm0
- movdqa xmm2, xmm0
- movdqa xmm3, xmm0
- movdqa xmm4, xmm0
- movdqa xmm5, xmm0
- movdqa xmm6, xmm0
- movdqa xmm7, xmm0
- psrlq xmm1, 8- 3 // byte k of each qword is shifted right by 8*k - 3*k so it starts at bit 3*k
- psrlq xmm2, 16- 6
- psrlq xmm3, 24- 9
- psrlq xmm4, 32-12
- psrlq xmm5, 40-15
- psrlq xmm6, 48-18
- psrlq xmm7, 56-21
- pand xmm0, SIMD_SSE2_dword_alpha_bit_mask0 // presumably isolates the 3 bits belonging to index k - confirm constants
- pand xmm1, SIMD_SSE2_dword_alpha_bit_mask1
- pand xmm2, SIMD_SSE2_dword_alpha_bit_mask2
- pand xmm3, SIMD_SSE2_dword_alpha_bit_mask3
- pand xmm4, SIMD_SSE2_dword_alpha_bit_mask4
- pand xmm5, SIMD_SSE2_dword_alpha_bit_mask5
- pand xmm6, SIMD_SSE2_dword_alpha_bit_mask6
- pand xmm7, SIMD_SSE2_dword_alpha_bit_mask7
- por xmm0, xmm1 // merge the eight partial results of each qword
- por xmm2, xmm3
- por xmm4, xmm5
- por xmm6, xmm7
- por xmm0, xmm2
- por xmm4, xmm6
- por xmm0, xmm4
- mov esi, outPtr
- movd [esi+0], xmm0 // 4-byte store of the low dword at bytes 0..3
- pshufd xmm1, xmm0, R_SHUFFLE_D( 2, 3, 0, 1 ) // presumably exchanges the two 64-bit halves - confirm macro order
- movd [esi+3], xmm1 // 4-byte store at bytes 3..6, overwriting byte 3 of the previous store
- }
- outData += 6; // only 6 bytes are accounted for
- #elif defined ( ID_WIN_X86_SSE2_INTRIN ) // intrinsics version of the same instruction sequence
- __m128i block0 = *((__m128i *)(&block[ 0]));
- __m128i block1 = *((__m128i *)(&block[16]));
- __m128i block2 = *((__m128i *)(&block[32]));
- __m128i block3 = *((__m128i *)(&block[48]));
- __m128c temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
- temp7 = _mm_cvtsi32_si128( channelBitOffset ); // shift count
- temp0 = _mm_srl_epi32( block0, temp7 );
- temp5 = _mm_srl_epi32( block1, temp7 );
- temp6 = _mm_srl_epi32( block2, temp7 );
- temp4 = _mm_srl_epi32( block3, temp7 );
- temp0 = _mm_and_si128( temp0, (const __m128i &)SIMD_SSE2_dword_byte_mask );
- temp5 = _mm_and_si128( temp5, (const __m128i &)SIMD_SSE2_dword_byte_mask );
- temp6 = _mm_and_si128( temp6, (const __m128i &)SIMD_SSE2_dword_byte_mask );
- temp4 = _mm_and_si128( temp4, (const __m128i &)SIMD_SSE2_dword_byte_mask );
- temp0 = _mm_packus_epi16( temp0, temp5 ); // one word per texel, texels 0-7
- temp6 = _mm_packus_epi16( temp6, temp4 ); // one word per texel, texels 8-15
- //---------------------
- // ab0 = ( 7 * maxAlpha + 7 * minAlpha + ALPHA_RANGE ) / 14
- // ab3 = ( 9 * maxAlpha + 5 * minAlpha + ALPHA_RANGE ) / 14
- // ab2 = ( 11 * maxAlpha + 3 * minAlpha + ALPHA_RANGE ) / 14
- // ab1 = ( 13 * maxAlpha + 1 * minAlpha + ALPHA_RANGE ) / 14
- // ab4 = ( 7 * maxAlpha + 7 * minAlpha + ALPHA_RANGE ) / 14
- // ab5 = ( 5 * maxAlpha + 9 * minAlpha + ALPHA_RANGE ) / 14
- // ab6 = ( 3 * maxAlpha + 11 * minAlpha + ALPHA_RANGE ) / 14
- // ab7 = ( 1 * maxAlpha + 13 * minAlpha + ALPHA_RANGE ) / 14
- temp5 = _mm_cvtsi32_si128( maxAlpha_ );
- temp5 = _mm_shufflelo_epi16( temp5, R_SHUFFLE_D( 0, 0, 0, 0 ) );
- temp5 = _mm_shuffle_epi32( temp5, R_SHUFFLE_D( 0, 0, 0, 0 ) ); // maxAlpha_ in all 8 words
- temp2 = _mm_cvtsi32_si128( minAlpha_ );
- temp2 = _mm_shufflelo_epi16( temp2, R_SHUFFLE_D( 0, 0, 0, 0 ) );
- temp2 = _mm_shuffle_epi32( temp2, R_SHUFFLE_D( 0, 0, 0, 0 ) ); // minAlpha_ in all 8 words
- temp7 = _mm_mullo_epi16( temp5, (const __m128i &)SIMD_SSE2_word_scale_7_5_3_1 );
- temp5 = _mm_mullo_epi16( temp5, (const __m128i &)SIMD_SSE2_word_scale_7_9_11_13 );
- temp3 = _mm_mullo_epi16( temp2, (const __m128i &)SIMD_SSE2_word_scale_7_9_11_13 );
- temp2 = _mm_mullo_epi16( temp2, (const __m128i &)SIMD_SSE2_word_scale_7_5_3_1 );
- temp5 = _mm_add_epi16( temp5, temp2 );
- temp7 = _mm_add_epi16( temp7, temp3 );
- temp5 = _mm_add_epi16( temp5, (const __m128i &)SIMD_SSE2_word_7 ); // rounding term
- temp7 = _mm_add_epi16( temp7, (const __m128i &)SIMD_SSE2_word_7 );
- temp5 = _mm_mulhi_epi16( temp5, (const __m128i &)SIMD_SSE2_word_div_by_14 ); // divide by 14 via multiply-high
- temp7 = _mm_mulhi_epi16( temp7, (const __m128i &)SIMD_SSE2_word_div_by_14 );
- temp1 = _mm_shuffle_epi32( temp5, R_SHUFFLE_D( 3, 3, 3, 3 ) );
- temp2 = _mm_shuffle_epi32( temp5, R_SHUFFLE_D( 2, 2, 2, 2 ) );
- temp3 = _mm_shuffle_epi32( temp5, R_SHUFFLE_D( 1, 1, 1, 1 ) );
- temp1 = _mm_packus_epi16( temp1, temp1 ); // ab1
- temp2 = _mm_packus_epi16( temp2, temp2 ); // ab2
- temp3 = _mm_packus_epi16( temp3, temp3 ); // ab3
- temp0 = _mm_packus_epi16( temp0, temp6 ); // alpha block, one byte per texel
- temp4 = _mm_shuffle_epi32( temp7, R_SHUFFLE_D( 0, 0, 0, 0 ) );
- temp5 = _mm_shuffle_epi32( temp7, R_SHUFFLE_D( 1, 1, 1, 1 ) );
- temp6 = _mm_shuffle_epi32( temp7, R_SHUFFLE_D( 2, 2, 2, 2 ) );
- temp7 = _mm_shuffle_epi32( temp7, R_SHUFFLE_D( 3, 3, 3, 3 ) );
- temp4 = _mm_packus_epi16( temp4, temp4 ); // ab4
- temp5 = _mm_packus_epi16( temp5, temp5 ); // ab5
- temp6 = _mm_packus_epi16( temp6, temp6 ); // ab6
- temp7 = _mm_packus_epi16( temp7, temp7 ); // ab7
- temp1 = _mm_max_epu8( temp1, temp0 ); // max( abN, alpha ) == alpha  <=>  alpha >= abN
- temp2 = _mm_max_epu8( temp2, temp0 );
- temp3 = _mm_max_epu8( temp3, temp0 );
- temp1 = _mm_cmpeq_epi8( temp1, temp0 );
- temp2 = _mm_cmpeq_epi8( temp2, temp0 );
- temp3 = _mm_cmpeq_epi8( temp3, temp0 );
- temp4 = _mm_max_epu8( temp4, temp0 );
- temp5 = _mm_max_epu8( temp5, temp0 );
- temp6 = _mm_max_epu8( temp6, temp0 );
- temp7 = _mm_max_epu8( temp7, temp0 );
- temp4 = _mm_cmpeq_epi8( temp4, temp0 );
- temp5 = _mm_cmpeq_epi8( temp5, temp0 );
- temp6 = _mm_cmpeq_epi8( temp6, temp0 );
- temp7 = _mm_cmpeq_epi8( temp7, temp0 );
- temp0 = _mm_adds_epi8( (const __m128i &)SIMD_SSE2_byte_8, temp1 ); // start value plus seven 0/-1 masks
- temp2 = _mm_adds_epi8( temp2, temp3 );
- temp4 = _mm_adds_epi8( temp4, temp5 );
- temp6 = _mm_adds_epi8( temp6, temp7 );
- temp0 = _mm_adds_epi8( temp0, temp2 );
- temp4 = _mm_adds_epi8( temp4, temp6 );
- temp0 = _mm_adds_epi8( temp0, temp4 );
- temp0 = _mm_and_si128( temp0, (const __m128i &)SIMD_SSE2_byte_7 );
- temp1 = _mm_cmpgt_epi8( (const __m128i &)SIMD_SSE2_byte_2, temp0 );
- temp1 = _mm_and_si128( temp1, (const __m128i &)SIMD_SSE2_byte_1 );
- temp0 = _mm_xor_si128( temp0, temp1 ); // same index fix-up as the assembly path
- temp1 = _mm_srli_epi64( temp0, 8 - 3 ); // byte k of each qword moves to bit 3*k
- temp2 = _mm_srli_epi64( temp0, 16 - 6 );
- temp3 = _mm_srli_epi64( temp0, 24 - 9 );
- temp4 = _mm_srli_epi64( temp0, 32 - 12 );
- temp5 = _mm_srli_epi64( temp0, 40 - 15 );
- temp6 = _mm_srli_epi64( temp0, 48 - 18 );
- temp7 = _mm_srli_epi64( temp0, 56 - 21 );
- temp0 = _mm_and_si128( temp0, (const __m128i &)SIMD_SSE2_dword_alpha_bit_mask0 );
- temp1 = _mm_and_si128( temp1, (const __m128i &)SIMD_SSE2_dword_alpha_bit_mask1 );
- temp2 = _mm_and_si128( temp2, (const __m128i &)SIMD_SSE2_dword_alpha_bit_mask2 );
- temp3 = _mm_and_si128( temp3, (const __m128i &)SIMD_SSE2_dword_alpha_bit_mask3 );
- temp4 = _mm_and_si128( temp4, (const __m128i &)SIMD_SSE2_dword_alpha_bit_mask4 );
- temp5 = _mm_and_si128( temp5, (const __m128i &)SIMD_SSE2_dword_alpha_bit_mask5 );
- temp6 = _mm_and_si128( temp6, (const __m128i &)SIMD_SSE2_dword_alpha_bit_mask6 );
- temp7 = _mm_and_si128( temp7, (const __m128i &)SIMD_SSE2_dword_alpha_bit_mask7 );
- temp0 = _mm_or_si128( temp0, temp1 );
- temp2 = _mm_or_si128( temp2, temp3 );
- temp4 = _mm_or_si128( temp4, temp5 );
- temp6 = _mm_or_si128( temp6, temp7 );
- temp0 = _mm_or_si128( temp0, temp2 );
- temp4 = _mm_or_si128( temp4, temp6 );
- temp0 = _mm_or_si128( temp0, temp4 );
- int out = _mm_cvtsi128_si32( temp0 ); // low dword: packed indices of the first 8 texels
- EmitUInt( out );
- outData--; // step back one byte; presumably EmitUInt advanced outData by 4, giving a net 3 - confirm
- temp1 = _mm_shuffle_epi32( temp0, R_SHUFFLE_D( 2, 3, 0, 1 ) );
- out = _mm_cvtsi128_si32( temp1 ); // packed indices of the last 8 texels
- EmitUInt( out );
- outData--;
- #else
- assert( false ); // no SSE2 implementation was compiled in
- #endif
- }
- /*
- ========================
- idDxtEncoder::CompressImageDXT1Fast_SSE2
-
- Compresses an image with 4 bytes per pixel to DXT1, one 4x4 pixel block at a time.
- For every block the bounding-box colors are computed, inset, written as two 565
- colors (max first, then min) and followed by the color indices.
- The source is walked through a local cursor so that inBuf still addresses the
- first pixel when the TEST_COMPRESSION cross-check re-compresses the image.
- params: inBuf - image to compress
- paramO: outBuf - result of compression
- params: width - width of image, at least 4 and a multiple of 4
- params: height - height of image, at least 4 and a multiple of 4
- ========================
- */
- void idDxtEncoder::CompressImageDXT1Fast_SSE2( const byte *inBuf, byte *outBuf, int width, int height ) {
-     ALIGN16( byte block[64] );
-     ALIGN16( byte minColor[4] );
-     ALIGN16( byte maxColor[4] );
-     assert( width >= 4 && ( width & 3 ) == 0 );
-     assert( height >= 4 && ( height & 3 ) == 0 );
-     this->width = width;
-     this->height = height;
-     this->outData = outBuf;
-     // Read cursor: advances by four pixel rows (plus srcPadding) per row of blocks.
-     const byte *srcRow = inBuf;
-     for ( int j = 0; j < height; j += 4, srcRow += width * 4*4 ) {
-         for ( int i = 0; i < width; i += 4 ) {
-             ExtractBlock_SSE2( srcRow + i * 4, width, block );
-             GetMinMaxBBox_SSE2( block, minColor, maxColor );
-             InsetColorsBBox_SSE2( minColor, maxColor );
-             EmitUShort( ColorTo565( maxColor ) );
-             EmitUShort( ColorTo565( minColor ) );
-             EmitColorIndices_SSE2( block, minColor, maxColor );
-         }
-         outData += dstPadding;
-         srcRow += srcPadding;
-     }
- #ifdef TEST_COMPRESSION
-     // Cross-check against the generic encoder, which writes an unpadded buffer.
-     int tmpDstPadding = dstPadding;
-     dstPadding = 0;
-     byte * testOutBuf = (byte *) _alloca16( width * height / 2 );
-     // inBuf was not advanced above, so the reference encoder sees the whole image.
-     CompressImageDXT1Fast_Generic( inBuf, testOutBuf, width, height );
-     for ( int j = 0; j < height/4; j++ ) {
-         for ( int i = 0; i < width/4; i++ ) {
-             byte * ptr1 = outBuf + ( j * width/4 + i ) * 8 + j * tmpDstPadding;
-             byte * ptr2 = testOutBuf + ( j * width/4 + i ) * 8;
-             for ( int k = 0; k < 8; k++ ) {
-                 assert( ptr1[k] == ptr2[k] );
-             }
-         }
-     }
-     dstPadding = tmpDstPadding;
- #endif
- }
- /*
- ========================
- idDxtEncoder::CompressImageDXT1AlphaFast_SSE2
-
- Compresses an image with 4 bytes per pixel to DXT1 with alpha, one 4x4 pixel block
- at a time. The minimum of channel 3 is sampled before the bounding box is inset:
- blocks whose minimum is at least 128 are written like plain DXT1 (max color first),
- all other blocks are written with the colors in the opposite order and use
- EmitColorAlphaIndices_SSE2 for the indices.
- The source is walked through a local cursor so that inBuf still addresses the
- first pixel when the TEST_COMPRESSION cross-check re-compresses the image.
- params: inBuf - image to compress
- paramO: outBuf - result of compression
- params: width - width of image, at least 4 and a multiple of 4
- params: height - height of image, at least 4 and a multiple of 4
- ========================
- */
- void idDxtEncoder::CompressImageDXT1AlphaFast_SSE2( const byte *inBuf, byte *outBuf, int width, int height ) {
-     ALIGN16( byte block[64] );
-     ALIGN16( byte minColor[4] );
-     ALIGN16( byte maxColor[4] );
-     assert( width >= 4 && ( width & 3 ) == 0 );
-     assert( height >= 4 && ( height & 3 ) == 0 );
-     this->width = width;
-     this->height = height;
-     this->outData = outBuf;
-     // Read cursor: advances by four pixel rows (plus srcPadding) per row of blocks.
-     const byte *srcRow = inBuf;
-     for ( int j = 0; j < height; j += 4, srcRow += width * 4*4 ) {
-         for ( int i = 0; i < width; i += 4 ) {
-             ExtractBlock_SSE2( srcRow + i * 4, width, block );
-             GetMinMaxBBox_SSE2( block, minColor, maxColor );
-             // Must be read before the inset below modifies minColor.
-             byte minAlpha = minColor[3];
-             InsetColorsBBox_SSE2( minColor, maxColor );
-             if ( minAlpha >= 128 ) {
-                 EmitUShort( ColorTo565( maxColor ) );
-                 EmitUShort( ColorTo565( minColor ) );
-                 EmitColorIndices_SSE2( block, minColor, maxColor );
-             } else {
-                 EmitUShort( ColorTo565( minColor ) );
-                 EmitUShort( ColorTo565( maxColor ) );
-                 EmitColorAlphaIndices_SSE2( block, minColor, maxColor );
-             }
-         }
-         outData += dstPadding;
-         srcRow += srcPadding;
-     }
- #ifdef TEST_COMPRESSION
-     // Cross-check against the generic encoder, which writes an unpadded buffer.
-     int tmpDstPadding = dstPadding;
-     dstPadding = 0;
-     byte * testOutBuf = (byte *) _alloca16( width * height / 2 );
-     // inBuf was not advanced above, so the reference encoder sees the whole image.
-     CompressImageDXT1AlphaFast_Generic( inBuf, testOutBuf, width, height );
-     for ( int j = 0; j < height/4; j++ ) {
-         for ( int i = 0; i < width/4; i++ ) {
-             byte * ptr1 = outBuf + ( j * width/4 + i ) * 8 + j * tmpDstPadding;
-             byte * ptr2 = testOutBuf + ( j * width/4 + i ) * 8;
-             for ( int k = 0; k < 8; k++ ) {
-                 assert( ptr1[k] == ptr2[k] );
-             }
-         }
-     }
-     dstPadding = tmpDstPadding;
- #endif
- }
- /*
- ========================
- idDxtEncoder::CompressImageDXT5Fast_SSE2
-
- Compresses an image with 4 bytes per pixel to DXT5, one 4x4 pixel block at a time.
- Each block is written as the two channel-3 end points (max, then min), the alpha
- indices, the two 565 colors (max, then min) and the color indices.
- The source is walked through a local cursor so that inBuf still addresses the
- first pixel when the TEST_COMPRESSION cross-check re-compresses the image.
- params: inBuf - image to compress
- paramO: outBuf - result of compression
- params: width - width of image, at least 4 and a multiple of 4
- params: height - height of image, at least 4 and a multiple of 4
- ========================
- */
- void idDxtEncoder::CompressImageDXT5Fast_SSE2( const byte *inBuf, byte *outBuf, int width, int height ) {
-     ALIGN16( byte block[64] );
-     ALIGN16( byte minColor[4] );
-     ALIGN16( byte maxColor[4] );
-     assert( width >= 4 && ( width & 3 ) == 0 );
-     assert( height >= 4 && ( height & 3 ) == 0 );
-     this->width = width;
-     this->height = height;
-     this->outData = outBuf;
-     // Read cursor: advances by four pixel rows (plus srcPadding) per row of blocks.
-     const byte *srcRow = inBuf;
-     for ( int j = 0; j < height; j += 4, srcRow += width * 4*4 ) {
-         for ( int i = 0; i < width; i += 4 ) {
-             ExtractBlock_SSE2( srcRow + i * 4, width, block );
-             GetMinMaxBBox_SSE2( block, minColor, maxColor );
-             InsetColorsBBox_SSE2( minColor, maxColor );
-             // alpha part of the block
-             EmitByte( maxColor[3] );
-             EmitByte( minColor[3] );
-             EmitAlphaIndices_SSE2( block, minColor[3], maxColor[3] );
-             // color part of the block
-             EmitUShort( ColorTo565( maxColor ) );
-             EmitUShort( ColorTo565( minColor ) );
-             EmitColorIndices_SSE2( block, minColor, maxColor );
-         }
-         outData += dstPadding;
-         srcRow += srcPadding;
-     }
- #ifdef TEST_COMPRESSION
-     // Cross-check against the generic encoder, which writes an unpadded buffer.
-     int tmpDstPadding = dstPadding;
-     dstPadding = 0;
-     byte * testOutBuf = (byte *) _alloca16( width * height );
-     // inBuf was not advanced above, so the reference encoder sees the whole image.
-     CompressImageDXT5Fast_Generic( inBuf, testOutBuf, width, height );
-     for ( int j = 0; j < height / 4; j++ ) {
-         for ( int i = 0; i < width / 4; i++ ) {
-             byte * ptr1 = outBuf + ( j * width/4 + i ) * 16 + j * tmpDstPadding;
-             byte * ptr2 = testOutBuf + ( j * width/4 + i ) * 16;
-             for ( int k = 0; k < 16; k++ ) {
-                 assert( ptr1[k] == ptr2[k] );
-             }
-         }
-     }
-     dstPadding = tmpDstPadding;
- #endif
- }
- /*
- ========================
- idDxtEncoder::ScaleYCoCg_SSE2
-
- Chooses a scale factor from the largest distance of the first two channels of
- minColor / maxColor to the SIMD_SSE2_word_center_128 value, then applies it in
- place to minColor, maxColor and all 16 texels of colorBlock.
- The thresholds are SIMD_SSE2_word_63 and SIMD_SSE2_word_31; with the byte_1 and
- byte_2 constants holding 1 and 2 (per their names - confirm) the value
- "scale - 1" is 0 above 63, 1 above 31 and 3 otherwise.
- "scale - 1", shifted left by 3, is also stored into the bounding colors in the byte
- selected by SIMD_SSE2_byte_scale_mask4.
- paramO: colorBlock - 16 texels of 4 bytes each, 16-byte aligned; rescaled in place
- paramO: minColor - bounding-box min color; rescaled in place
- paramO: maxColor - bounding-box max color; rescaled in place
- NOTE(review): declared const although it writes through all three pointers.
- ========================
- */
- ID_INLINE void idDxtEncoder::ScaleYCoCg_SSE2( byte *colorBlock, byte *minColor, byte *maxColor ) const {
- #if ( defined( ID_WIN_X86_ASM ) || defined( ID_MAC_X86_ASM ) )
- __asm {
- mov esi, colorBlock
- mov edx, minColor
- mov ecx, maxColor
- movd xmm0, dword ptr [edx]
- movd xmm1, dword ptr [ecx]
- punpcklbw xmm0, SIMD_SSE2_byte_0 // widen the 4 channel bytes to words
- punpcklbw xmm1, SIMD_SSE2_byte_0
- movdqa xmm6, SIMD_SSE2_word_center_128
- movdqa xmm7, SIMD_SSE2_word_center_128
- psubw xmm6, xmm0 // center - min
- psubw xmm7, xmm1 // center - max
- psubw xmm0, SIMD_SSE2_word_center_128 // min - center
- psubw xmm1, SIMD_SSE2_word_center_128 // max - center
- pmaxsw xmm6, xmm0 // | min - center |
- pmaxsw xmm7, xmm1 // | max - center |
- pmaxsw xmm6, xmm7 // larger of the two distances per channel
- pshuflw xmm7, xmm6, R_SHUFFLE_D( 1, 0, 1, 0 ) // presumably exchanges words 0 and 1 - confirm macro order
- pmaxsw xmm6, xmm7 // combine the first two channels
- pshufd xmm6, xmm6, R_SHUFFLE_D( 0, 0, 0, 0 ) // replicate the result across the register
- movdqa xmm7, xmm6
- pcmpgtw xmm6, SIMD_SSE2_word_63 // mask0
- pcmpgtw xmm7, SIMD_SSE2_word_31 // mask1
- pandn xmm7, SIMD_SSE2_byte_2 // byte_2 where the distance is not above the word_31 threshold
- por xmm7, SIMD_SSE2_byte_1
- pandn xmm6, xmm7 // cleared where the distance is above the word_63 threshold: this is scale - 1
- movdqa xmm3, xmm6 // keep scale - 1 for storing into the bounding colors
- movdqa xmm7, xmm6
- pxor xmm7, SIMD_SSE2_byte_not
- por xmm7, SIMD_SSE2_byte_scale_mask0 // 0xFF, 0xFF, 0x00, 0x00, 0xFF, 0xFF, 0x00, 0x00
- paddw xmm6, SIMD_SSE2_byte_1 // scale
- pand xmm6, SIMD_SSE2_byte_scale_mask1 // 0x00, 0x00, 0x00, 0xFF, 0x00, 0x00, 0x00, 0xFF
- por xmm6, SIMD_SSE2_byte_scale_mask2 // 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00
- movd xmm4, dword ptr [edx] // xmm6 is now the per-word multiplier, xmm7 the AND mask applied after multiplying
- movd xmm5, dword ptr [ecx]
- pand xmm4, SIMD_SSE2_byte_scale_mask3 // 0x00, 0x00, 0x00, 0x00, 0xFF, 0x00, 0xFF, 0xFF
- pand xmm5, SIMD_SSE2_byte_scale_mask3
- pslld xmm3, 3 // ( scale - 1 ) << 3
- pand xmm3, SIMD_SSE2_byte_scale_mask4 // 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0x00, 0x00
- por xmm4, xmm3 // store the shifted scale in both bounding colors
- por xmm5, xmm3
- paddb xmm4, SIMD_SSE2_byte_minus_128_0 // apply the byte_minus_128_0 offset before scaling
- paddb xmm5, SIMD_SSE2_byte_minus_128_0
- pmullw xmm4, xmm6
- pmullw xmm5, xmm6
- pand xmm4, xmm7
- pand xmm5, xmm7
- psubb xmm4, SIMD_SSE2_byte_minus_128_0 // remove the offset again
- psubb xmm5, SIMD_SSE2_byte_minus_128_0
- movd dword ptr [edx], xmm4
- movd dword ptr [ecx], xmm5
- movdqa xmm0, xmmword ptr [esi+ 0*4] // same offset / multiply / mask / offset sequence for the 16 texels
- movdqa xmm1, xmmword ptr [esi+ 4*4]
- movdqa xmm2, xmmword ptr [esi+ 8*4]
- movdqa xmm3, xmmword ptr [esi+12*4]
- paddb xmm0, SIMD_SSE2_byte_minus_128_0
- paddb xmm1, SIMD_SSE2_byte_minus_128_0
- paddb xmm2, SIMD_SSE2_byte_minus_128_0
- paddb xmm3, SIMD_SSE2_byte_minus_128_0
- pmullw xmm0, xmm6
- pmullw xmm1, xmm6
- pmullw xmm2, xmm6
- pmullw xmm3, xmm6
- pand xmm0, xmm7
- pand xmm1, xmm7
- pand xmm2, xmm7
- pand xmm3, xmm7
- psubb xmm0, SIMD_SSE2_byte_minus_128_0
- psubb xmm1, SIMD_SSE2_byte_minus_128_0
- psubb xmm2, SIMD_SSE2_byte_minus_128_0
- psubb xmm3, SIMD_SSE2_byte_minus_128_0
- movdqa xmmword ptr [esi+ 0*4], xmm0
- movdqa xmmword ptr [esi+ 4*4], xmm1
- movdqa xmmword ptr [esi+ 8*4], xmm2
- movdqa xmmword ptr [esi+12*4], xmm3
- }
- #elif defined ( ID_WIN_X86_SSE2_INTRIN ) // intrinsics version of the same instruction sequence
- __m128i block0 = *((__m128i *)(&colorBlock[ 0]));
- __m128i block1 = *((__m128i *)(&colorBlock[16]));
- __m128i block2 = *((__m128i *)(&colorBlock[32]));
- __m128i block3 = *((__m128i *)(&colorBlock[48]));
- __m128c temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
- temp0 = _mm_cvtsi32_si128( *(int *)minColor );
- temp1 = _mm_cvtsi32_si128( *(int *)maxColor );
- temp0 = _mm_unpacklo_epi8( temp0, (const __m128i &)SIMD_SSE2_byte_0 ); // widen channel bytes to words
- temp1 = _mm_unpacklo_epi8( temp1, (const __m128i &)SIMD_SSE2_byte_0 );
- // TODO: Algorithm seems to be get the absolute difference
- temp6 = _mm_sub_epi16( (const __m128i &)SIMD_SSE2_word_center_128, temp0 );
- temp7 = _mm_sub_epi16( (const __m128i &)SIMD_SSE2_word_center_128, temp1 );
- temp0 = _mm_sub_epi16( temp0, (const __m128i &)SIMD_SSE2_word_center_128 );
- temp1 = _mm_sub_epi16( temp1, (const __m128i &)SIMD_SSE2_word_center_128 );
- temp6 = _mm_max_epi16( temp6, temp0 ); // | min - center |
- temp7 = _mm_max_epi16( temp7, temp1 ); // | max - center |
- temp6 = _mm_max_epi16( temp6, temp7 );
- temp7 = _mm_shufflelo_epi16( temp6, R_SHUFFLE_D( 1, 0, 1, 0 ) );
- temp6 = _mm_max_epi16( temp6, temp7 ); // combine the first two channels
- temp6 = _mm_shuffle_epi32( temp6, R_SHUFFLE_D( 0, 0, 0, 0 ) );
- temp7 = temp6;
- temp6 = _mm_cmpgt_epi16( temp6, (const __m128i &)SIMD_SSE2_word_63 ); // mask0
- temp7 = _mm_cmpgt_epi16( temp7, (const __m128i &)SIMD_SSE2_word_31 ); // mask1
- temp7 = _mm_andnot_si128( temp7, (const __m128i &)SIMD_SSE2_byte_2 );
- temp7 = _mm_or_si128( temp7, (const __m128i &)SIMD_SSE2_byte_1 );
- temp6 = _mm_andnot_si128( temp6, temp7 ); // scale - 1
- temp3 = temp6;
- temp7 = temp6;
- temp7 = _mm_xor_si128( temp7, (const __m128i &)SIMD_SSE2_byte_not );
- temp7 = _mm_or_si128( temp7, (const __m128i &)SIMD_SSE2_byte_scale_mask0 ); // 0xFF, 0xFF, 0x00, 0x00, 0xFF, 0xFF, 0x00, 0x00
- temp6 = _mm_add_epi16( temp6, (const __m128i &)SIMD_SSE2_byte_1 );
- temp6 = _mm_and_si128( temp6, (const __m128i &)SIMD_SSE2_byte_scale_mask1 ); // 0x00, 0x00, 0x00, 0xFF, 0x00, 0x00, 0x00, 0xFF
- temp6 = _mm_or_si128( temp6, (const __m128i &)SIMD_SSE2_byte_scale_mask2 ); // 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00
- // TODO: remove this second store
- temp4 = _mm_cvtsi32_si128( *(int *)minColor );
- temp5 = _mm_cvtsi32_si128( *(int *)maxColor );
- temp4 = _mm_and_si128( temp4, (const __m128i &)SIMD_SSE2_byte_scale_mask3 ); // 0x00, 0x00, 0x00, 0x00, 0xFF, 0x00, 0xFF, 0xFF
- temp5 = _mm_and_si128( temp5, (const __m128i &)SIMD_SSE2_byte_scale_mask3 );
- temp3 = _mm_slli_epi32( temp3, 3 ); // ( scale - 1 ) << 3
- temp3 = _mm_and_si128( temp3, (const __m128i &)SIMD_SSE2_byte_scale_mask4 ); // 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0x00, 0x00
- temp4 = _mm_or_si128( temp4, temp3 );
- temp5 = _mm_or_si128( temp5, temp3 );
- temp4 = _mm_add_epi8( temp4, (const __m128i &)SIMD_SSE2_byte_minus_128_0 );
- temp5 = _mm_add_epi8( temp5, (const __m128i &)SIMD_SSE2_byte_minus_128_0 );
- temp4 = _mm_mullo_epi16( temp4, temp6 );
- temp5 = _mm_mullo_epi16( temp5, temp6 );
- temp4 = _mm_and_si128( temp4, temp7 );
- temp5 = _mm_and_si128( temp5, temp7 );
- temp4 = _mm_sub_epi8( temp4, (const __m128i &)SIMD_SSE2_byte_minus_128_0 );
- temp5 = _mm_sub_epi8( temp5, (const __m128i &)SIMD_SSE2_byte_minus_128_0 );
- *(int *)minColor = _mm_cvtsi128_si32( temp4 );
- *(int *)maxColor = _mm_cvtsi128_si32( temp5 );
- temp0 = _mm_add_epi8( block0, (const __m128i &)SIMD_SSE2_byte_minus_128_0 ); // same sequence for the 16 texels
- temp1 = _mm_add_epi8( block1, (const __m128i &)SIMD_SSE2_byte_minus_128_0 );
- temp2 = _mm_add_epi8( block2, (const __m128i &)SIMD_SSE2_byte_minus_128_0 );
- temp3 = _mm_add_epi8( block3, (const __m128i &)SIMD_SSE2_byte_minus_128_0 );
- temp0 = _mm_mullo_epi16( temp0, temp6 );
- temp1 = _mm_mullo_epi16( temp1, temp6 );
- temp2 = _mm_mullo_epi16( temp2, temp6 );
- temp3 = _mm_mullo_epi16( temp3, temp6 );
- temp0 = _mm_and_si128( temp0, temp7 );
- temp1 = _mm_and_si128( temp1, temp7 );
- temp2 = _mm_and_si128( temp2, temp7 );
- temp3 = _mm_and_si128( temp3, temp7 );
- *((__m128i *)(&colorBlock[ 0])) = _mm_sub_epi8( temp0, (const __m128i &)SIMD_SSE2_byte_minus_128_0 );
- *((__m128i *)(&colorBlock[16])) = _mm_sub_epi8( temp1, (const __m128i &)SIMD_SSE2_byte_minus_128_0 );
- *((__m128i *)(&colorBlock[32])) = _mm_sub_epi8( temp2, (const __m128i &)SIMD_SSE2_byte_minus_128_0 );
- *((__m128i *)(&colorBlock[48])) = _mm_sub_epi8( temp3, (const __m128i &)SIMD_SSE2_byte_minus_128_0 );
- #else
- assert( false ); // no SSE2 implementation was compiled in
- #endif
- }
- /*
- ========================
- idDxtEncoder::InsetYCoCgBBox_SSE2
-
- Moves minColor up and maxColor down by an inset derived from their difference,
- then quantizes both colors and writes them back in place.
- Per channel: inset = ( max - min - Round ) & Mask; min = min * ShiftUp + inset and
- max = max * ShiftUp - inset; both are scaled back with a multiply-high by ShiftDown,
- clamped to be non-negative, masked with QuantMask, OR-ed with a multiply-high by
- Rep of themselves and packed to bytes with unsigned saturation.
- The names refer to the SIMD_SSE2_word_insetYCoCg* constants, whose values are not
- visible in this function.
- paramO: minColor - 4 byte min color, updated in place
- paramO: maxColor - 4 byte max color, updated in place
- NOTE(review): declared const although it writes through both pointers.
- ========================
- */
- ID_INLINE void idDxtEncoder::InsetYCoCgBBox_SSE2( byte *minColor, byte *maxColor ) const {
- #if ( defined( ID_WIN_X86_ASM ) || defined( ID_MAC_X86_ASM ) )
- __asm {
- mov esi, minColor
- mov edi, maxColor
- movd xmm0, dword ptr [esi]
- movd xmm1, dword ptr [edi]
- punpcklbw xmm0, SIMD_SSE2_byte_0 // widen the 4 channel bytes to words
- punpcklbw xmm1, SIMD_SSE2_byte_0
- movdqa xmm2, xmm1
- psubw xmm2, xmm0 // max - min
- psubw xmm2, SIMD_SSE2_word_insetYCoCgRound
- pand xmm2, SIMD_SSE2_word_insetYCoCgMask // xmm2 = inset
- pmullw xmm0, SIMD_SSE2_word_insetYCoCgShiftUp
- pmullw xmm1, SIMD_SSE2_word_insetYCoCgShiftUp
- paddw xmm0, xmm2 // min moves up
- psubw xmm1, xmm2 // max moves down
- pmulhw xmm0, SIMD_SSE2_word_insetYCoCgShiftDown
- pmulhw xmm1, SIMD_SSE2_word_insetYCoCgShiftDown
- pmaxsw xmm0, SIMD_SSE2_word_0 // clamp negative results to zero
- pmaxsw xmm1, SIMD_SSE2_word_0
- pand xmm0, SIMD_SSE2_word_insetYCoCgQuantMask // quantize
- pand xmm1, SIMD_SSE2_word_insetYCoCgQuantMask
- movdqa xmm2, xmm0
- movdqa xmm3, xmm1
- pmulhw xmm2, SIMD_SSE2_word_insetYCoCgRep // presumably replicates the high bits into the low bits - confirm constant
- pmulhw xmm3, SIMD_SSE2_word_insetYCoCgRep
- por xmm0, xmm2
- por xmm1, xmm3
- packuswb xmm0, xmm0 // back to bytes with unsigned saturation
- packuswb xmm1, xmm1
- movd dword ptr [esi], xmm0
- movd dword ptr [edi], xmm1
- }
- #elif defined ( ID_WIN_X86_SSE2_INTRIN ) // intrinsics version of the same instruction sequence
- __m128c temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; // NOTE(review): temp4..temp7 are never used in this path
- temp0 = _mm_cvtsi32_si128( *(int *)minColor );
- temp1 = _mm_cvtsi32_si128( *(int *)maxColor );
- temp0 = _mm_unpacklo_epi8( temp0, (const __m128i &)SIMD_SSE2_byte_0 ); // widen channel bytes to words
- temp1 = _mm_unpacklo_epi8( temp1, (const __m128i &)SIMD_SSE2_byte_0 );
- temp2 = _mm_sub_epi16( temp1, temp0 ); // max - min
- temp2 = _mm_sub_epi16( temp2, (const __m128i &)SIMD_SSE2_word_insetYCoCgRound );
- temp2 = _mm_and_si128( temp2, (const __m128i &)SIMD_SSE2_word_insetYCoCgMask ); // inset
- temp0 = _mm_mullo_epi16( temp0, (const __m128i &)SIMD_SSE2_word_insetYCoCgShiftUp );
- temp1 = _mm_mullo_epi16( temp1, (const __m128i &)SIMD_SSE2_word_insetYCoCgShiftUp );
- temp0 = _mm_add_epi16( temp0, temp2 ); // min moves up
- temp1 = _mm_sub_epi16( temp1, temp2 ); // max moves down
- temp0 = _mm_mulhi_epi16( temp0, (const __m128i &)SIMD_SSE2_word_insetYCoCgShiftDown );
- temp1 = _mm_mulhi_epi16( temp1, (const __m128i &)SIMD_SSE2_word_insetYCoCgShiftDown );
- temp0 = _mm_max_epi16( temp0, (const __m128i &)SIMD_SSE2_word_0 ); // clamp negative results to zero
- temp1 = _mm_max_epi16( temp1, (const __m128i &)SIMD_SSE2_word_0 );
- temp0 = _mm_and_si128( temp0, (const __m128i &)SIMD_SSE2_word_insetYCoCgQuantMask );
- temp1 = _mm_and_si128( temp1, (const __m128i &)SIMD_SSE2_word_insetYCoCgQuantMask );
- temp2 = _mm_mulhi_epi16( temp0, (const __m128i &)SIMD_SSE2_word_insetYCoCgRep );
- temp3 = _mm_mulhi_epi16( temp1, (const __m128i &)SIMD_SSE2_word_insetYCoCgRep );
- temp0 = _mm_or_si128( temp0, temp2 );
- temp1 = _mm_or_si128( temp1, temp3 );
- temp0 = _mm_packus_epi16( temp0, temp0 ); // back to bytes with unsigned saturation
- temp1 = _mm_packus_epi16( temp1, temp1 );
- *(int *)minColor = _mm_cvtsi128_si32( temp0 );
- *(int *)maxColor = _mm_cvtsi128_si32( temp1 );
- #else
- assert( false ); // no SSE2 implementation was compiled in
- #endif
- }
- /*
- ========================
- idDxtEncoder::SelectYCoCgDiagonal_SSE2
-
- Counts the texels whose first two channel bytes lie on different sides of the
- rounded average of minColor and maxColor (one byte at or above it, the other
- below). When that count exceeds the SIMD_SSE2_word_8 threshold, the bytes selected
- by SIMD_SSE2_byte_diagonalMask are exchanged between minColor and maxColor.
- With NVIDIA_7X_HARDWARE_BUG_FIX defined the exchange is additionally suppressed
- when minColor[0] equals maxColor[0].
- params: colorBlock - 16 pixel block to find color indexes for
- paramO: minColor - min color on entry; holds the possibly exchanged bytes on exit
- paramO: maxColor - max color on entry; holds the possibly exchanged bytes on exit
- return: nothing, the function is void; the selected diagonal is expressed through
-         the exchange of bytes between minColor and maxColor
- NOTE(review): declared const although it writes through minColor and maxColor.
- ========================
- */
- ID_INLINE void idDxtEncoder::SelectYCoCgDiagonal_SSE2( const byte *colorBlock, byte *minColor, byte *maxColor ) const {
- #if ( defined( ID_WIN_X86_ASM ) || defined( ID_MAC_X86_ASM ) )
- __asm {
- mov esi, colorBlock
- mov edx, minColor
- mov ecx, maxColor
- movdqa xmm0, xmmword ptr [esi+ 0]
- movdqa xmm1, xmmword ptr [esi+16]
- movdqa xmm2, xmmword ptr [esi+32]
- movdqa xmm3, xmmword ptr [esi+48]
- pand xmm0, SIMD_SSE2_dword_word_mask // presumably keeps the low 16 bits (first two channels) of each texel - confirm constant
- pand xmm1, SIMD_SSE2_dword_word_mask
- pand xmm2, SIMD_SSE2_dword_word_mask
- pand xmm3, SIMD_SSE2_dword_word_mask
- pslldq xmm1, 2 // shift by two bytes so these values land in the cleared upper half of each dword
- pslldq xmm3, 2
- por xmm0, xmm1 // texels 0-7, two bytes each
- por xmm2, xmm3 // texels 8-15, two bytes each
- movd xmm1, dword ptr [edx] // minColor
- movd xmm3, dword ptr [ecx] // maxColor
- movdqa xmm6, xmm1
- movdqa xmm7, xmm3
- pavgb xmm1, xmm3 // rounded per-byte average of min and max
- pshuflw xmm1, xmm1, R_SHUFFLE_D( 0, 0, 0, 0 )
- pshufd xmm1, xmm1, R_SHUFFLE_D( 0, 0, 0, 0 ) // average of the first two channels replicated into every word
- movdqa xmm3, xmm1
- pmaxub xmm1, xmm0
- pmaxub xmm3, xmm2
- pcmpeqb xmm1, xmm0 // 0xFF in every byte that is >= the average
- pcmpeqb xmm3, xmm2
- movdqa xmm0, xmm1
- movdqa xmm2, xmm3
- psrldq xmm0, 1 // bring each second-channel mask next to its first-channel mask
- psrldq xmm2, 1
- pxor xmm0, xmm1 // set where the two channels are on different sides
- pxor xmm2, xmm3
- pand xmm0, SIMD_SSE2_word_1 // presumably leaves a single 0/1 flag per texel - confirm constant
- pand xmm2, SIMD_SSE2_word_1
- paddw xmm0, xmm2
- psadbw xmm0, SIMD_SSE2_byte_0 // sum the flags within each 64-bit half
- pshufd xmm1, xmm0, R_SHUFFLE_D( 2, 3, 0, 1 ) // presumably exchanges the two 64-bit halves - confirm macro order
- #ifdef NVIDIA_7X_HARDWARE_BUG_FIX
- paddw xmm1, xmm0 // side
- pcmpgtw xmm1, SIMD_SSE2_word_8 // mask = -( side > 8 )
- pand xmm1, SIMD_SSE2_byte_diagonalMask
- movdqa xmm0, xmm6
- pcmpeqb xmm0, xmm7 // mask &= -( minColor[0] != maxColor[0] )
- pslldq xmm0, 1
- pandn xmm0, xmm1
- #else
- paddw xmm0, xmm1 // side
- pcmpgtw xmm0, SIMD_SSE2_word_8 // mask = -( side > 8 )
- pand xmm0, SIMD_SSE2_byte_diagonalMask
- #endif
- pxor xmm6, xmm7 // min ^ max
- pand xmm0, xmm6 // bits that differ, limited to the masked bytes
- pxor xmm7, xmm0 // max takes the min bytes where the mask is set
- pxor xmm6, xmm7 // min takes the max bytes where the mask is set
- movd dword ptr [edx], xmm6
- movd dword ptr [ecx], xmm7
- }
- #elif defined ( ID_WIN_X86_SSE2_INTRIN ) // intrinsics version of the same instruction sequence
- __m128i block0 = *((__m128i *)(&colorBlock[ 0]));
- __m128i block1 = *((__m128i *)(&colorBlock[16]));
- __m128i block2 = *((__m128i *)(&colorBlock[32]));
- __m128i block3 = *((__m128i *)(&colorBlock[48]));
- __m128c temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; // NOTE(review): temp4 and temp5 are never used in this path
- temp0 = _mm_and_si128( block0, (const __m128i &)SIMD_SSE2_dword_word_mask );
- temp1 = _mm_and_si128( block1, (const __m128i &)SIMD_SSE2_dword_word_mask );
- temp2 = _mm_and_si128( block2, (const __m128i &)SIMD_SSE2_dword_word_mask );
- temp3 = _mm_and_si128( block3, (const __m128i &)SIMD_SSE2_dword_word_mask );
- temp1 = _mm_slli_si128( temp1, 2 );
- temp3 = _mm_slli_si128( temp3, 2 );
- temp0 = _mm_or_si128( temp0, temp1 ); // texels 0-7, two bytes each
- temp2 = _mm_or_si128( temp2, temp3 ); // texels 8-15, two bytes each
- temp6 = _mm_cvtsi32_si128( *(int *)minColor );
- temp7 = _mm_cvtsi32_si128( *(int *)maxColor );
- temp1 = _mm_avg_epu8( temp6, temp7 ); // rounded per-byte average of min and max
- temp1 = _mm_shufflelo_epi16( temp1, R_SHUFFLE_D( 0, 0, 0, 0 ) );
- temp1 = _mm_shuffle_epi32( temp1, R_SHUFFLE_D( 0, 0, 0, 0 ) );
- temp3 = _mm_max_epu8( temp1, temp2 );
- temp1 = _mm_max_epu8( temp1, temp0 );
- temp1 = _mm_cmpeq_epi8( temp1, temp0 ); // 0xFF in every byte that is >= the average
- temp3 = _mm_cmpeq_epi8( temp3, temp2 );
- temp0 = _mm_srli_si128( temp1, 1 );
- temp2 = _mm_srli_si128( temp3, 1 );
- temp0 = _mm_xor_si128( temp0, temp1 ); // set where the two channels are on different sides
- temp2 = _mm_xor_si128( temp2, temp3 );
- temp0 = _mm_and_si128( temp0, (const __m128i &)SIMD_SSE2_word_1 );
- temp2 = _mm_and_si128( temp2, (const __m128i &)SIMD_SSE2_word_1 );
- temp0 = _mm_add_epi16( temp0, temp2 );
- temp0 = _mm_sad_epu8( temp0, (const __m128i &)SIMD_SSE2_byte_0 ); // sum the flags within each 64-bit half
- temp1 = _mm_shuffle_epi32( temp0, R_SHUFFLE_D( 2, 3, 0, 1 ) );
- #ifdef NVIDIA_7X_HARDWARE_BUG_FIX
- temp1 = _mm_add_epi16( temp1, temp0 ); // side
- temp1 = _mm_cmpgt_epi16( temp1, (const __m128i &)SIMD_SSE2_word_8 ); // mask = -( side > 8 )
- temp1 = _mm_and_si128( temp1, (const __m128i &)SIMD_SSE2_byte_diagonalMask );
- temp0 = _mm_cmpeq_epi8( temp6, temp7 ); // mask &= -( minColor[0] != maxColor[0] )
- temp0 = _mm_slli_si128( temp0, 1 );
- temp0 = _mm_andnot_si128( temp0, temp1 );
- #else
- temp0 = _mm_add_epi16( temp0, temp1 ); // side
- temp0 = _mm_cmpgt_epi16( temp0, (const __m128i &)SIMD_SSE2_word_8 ); // mask = -( side > 8 )
- temp0 = _mm_and_si128( temp0, (const __m128i &)SIMD_SSE2_byte_diagonalMask );
- #endif
- temp6 = _mm_xor_si128( temp6, temp7 ); // min ^ max
- temp0 = _mm_and_si128( temp0, temp6 );
- temp7 = _mm_xor_si128( temp7, temp0 ); // max takes the min bytes where the mask is set
- temp6 = _mm_xor_si128( temp6, temp7 ); // min takes the max bytes where the mask is set
- *(int *)minColor = _mm_cvtsi128_si32( temp6 );
- *(int *)maxColor = _mm_cvtsi128_si32( temp7 );
- #else
- assert( false ); // no SSE2 implementation was compiled in
- #endif
- }
- /*
- ========================
- idDxtEncoder::CompressYCoCgDXT5Fast_SSE2
-
- Compresses an image with 4 bytes per pixel to YCoCg DXT5, one 4x4 pixel block at a
- time. Per block: bounding box, YCoCg scaling, bounding box inset and diagonal
- selection, then the channel-3 end points (max, then min) with their indices,
- followed by the two 565 colors (max, then min) and the CoCg indices.
- The source is walked through a local cursor so that inBuf still addresses the
- first pixel when the TEST_COMPRESSION cross-check re-compresses the image.
- params: inBuf - image to compress
- paramO: outBuf - result of compression
- params: width - width of image, at least 4 and a multiple of 4
- params: height - height of image, at least 4 and a multiple of 4
- ========================
- */
- void idDxtEncoder::CompressYCoCgDXT5Fast_SSE2( const byte *inBuf, byte *outBuf, int width, int height ) {
-     ALIGN16( byte block[64] );
-     ALIGN16( byte minColor[4] );
-     ALIGN16( byte maxColor[4] );
-     //assert( HasConstantValuePer4x4Block( inBuf, width, height, 2 ) );
-     assert( width >= 4 && ( width & 3 ) == 0 );
-     assert( height >= 4 && ( height & 3 ) == 0 );
-     this->width = width;
-     this->height = height;
-     this->outData = outBuf;
-     // Read cursor: advances by four pixel rows (plus srcPadding) per row of blocks.
-     const byte *srcRow = inBuf;
-     for ( int j = 0; j < height; j += 4, srcRow += width * 4*4 ) {
-         for ( int i = 0; i < width; i += 4 ) {
-             ExtractBlock_SSE2( srcRow + i * 4, width, block );
-             GetMinMaxBBox_SSE2( block, minColor, maxColor );
-             ScaleYCoCg_SSE2( block, minColor, maxColor );
-             InsetYCoCgBBox_SSE2( minColor, maxColor );
-             SelectYCoCgDiagonal_SSE2( block, minColor, maxColor );
-             // alpha part of the block
-             EmitByte( maxColor[3] );
-             EmitByte( minColor[3] );
-             EmitAlphaIndices_SSE2( block, minColor[3], maxColor[3] );
-             // color part of the block
-             EmitUShort( ColorTo565( maxColor ) );
-             EmitUShort( ColorTo565( minColor ) );
-             EmitCoCgIndices_SSE2( block, minColor, maxColor );
-         }
-         outData += dstPadding;
-         srcRow += srcPadding;
-     }
- #ifdef TEST_COMPRESSION
-     // Cross-check against the generic encoder, which writes an unpadded buffer.
-     int tmpDstPadding = dstPadding;
-     dstPadding = 0;
-     byte * testOutBuf = (byte *) _alloca16( width * height );
-     // inBuf was not advanced above, so the reference encoder sees the whole image.
-     CompressYCoCgDXT5Fast_Generic( inBuf, testOutBuf, width, height );
-     for ( int j = 0; j < height / 4; j++ ) {
-         for ( int i = 0; i < width / 4; i++ ) {
-             byte * ptr1 = outBuf + ( j * width/4 + i ) * 16 + j * tmpDstPadding;
-             byte * ptr2 = testOutBuf + ( j * width/4 + i ) * 16;
-             for ( int k = 0; k < 16; k++ ) {
-                 assert( ptr1[k] == ptr2[k] );
-             }
-         }
-     }
-     dstPadding = tmpDstPadding;
- #endif
- }
- /*
- ========================
- idDxtEncoder::EmitGreenIndices_SSE2
- params: block - 16-normal block for which to find normal Y indices
- params: channelBitOffset - right shift in bits applied to each 32-bit element of block before its low byte is used
- params: minGreen - Minimal normal Y found
- params: maxGreen - Maximal normal Y found
- ========================
- */
- void idDxtEncoder::EmitGreenIndices_SSE2( const byte *block, const int channelBitOffset, const int minGreen, const int maxGreen ) {
- #if ( defined( ID_WIN_X86_ASM ) || defined( ID_MAC_X86_ASM ) )
- assert( maxGreen >= minGreen );
- byte *outPtr = outData;
- __asm {
- movd xmm7, channelBitOffset
- mov esi, block
- movdqa xmm0, xmmword ptr [esi+ 0]
- movdqa xmm5, xmmword ptr [esi+16]
- movdqa xmm6, xmmword ptr [esi+32]
- movdqa xmm4, xmmword ptr [esi+48]
- psrld xmm0, xmm7
- psrld xmm5, xmm7
- psrld xmm6, xmm7
- psrld xmm4, xmm7
- pand xmm0, SIMD_SSE2_dword_byte_mask
- pand xmm5, SIMD_SSE2_dword_byte_mask
- pand xmm6, SIMD_SSE2_dword_byte_mask
- pand xmm4, SIMD_SSE2_dword_byte_mask
- packuswb xmm0, xmm5
- packuswb xmm6, xmm4
- //---------------------
- movd xmm2, maxGreen
- pshuflw xmm2, xmm2, R_SHUFFLE_D( 0, 0, 0, 0 )
- movd xmm3, minGreen
- pshuflw xmm3, xmm3, R_SHUFFLE_D( 0, 0, 0, 0 )
- pmullw xmm2, SIMD_SSE2_word_scale_5_3_1
- pmullw xmm3, SIMD_SSE2_word_scale_1_3_5
- paddw xmm2, SIMD_SSE2_word_3
- paddw xmm3, xmm2
- pmulhw xmm3, SIMD_SSE2_word_div_by_6
- pshuflw xmm1, xmm3, R_SHUFFLE_D( 0, 0, 0, 0 )
- pshuflw xmm2, xmm3, R_SHUFFLE_D( 1, 1, 1, 1 )
- pshuflw xmm3, xmm3, R_SHUFFLE_D( 2, 2, 2, 2 )
- pshufd xmm1, xmm1, R_SHUFFLE_D( 0, 0, 0, 0 )
- pshufd xmm2, xmm2, R_SHUFFLE_D( 0, 0, 0, 0 )
- pshufd xmm3, xmm3, R_SHUFFLE_D( 0, 0, 0, 0 )
- packuswb xmm1, xmm1
- packuswb xmm2, xmm2
- packuswb xmm3, xmm3
- packuswb xmm0, xmm6
- pmaxub xmm1, xmm0
- pmaxub xmm2, xmm0
- pmaxub xmm3, xmm0
- pcmpeqb xmm1, xmm0
- pcmpeqb xmm2, xmm0
- pcmpeqb xmm3, xmm0
- movdqa xmm0, SIMD_SSE2_byte_4
- paddsb xmm0, xmm1
- paddsb xmm2, xmm3
- paddsb xmm0, xmm2
- pand xmm0, SIMD_SSE2_byte_3
- movdqa xmm4, SIMD_SSE2_byte_2
- pcmpgtb xmm4, xmm0
- pand xmm4, SIMD_SSE2_byte_1
- pxor xmm0, xmm4
- movdqa xmm4, xmm0
- movdqa xmm5, xmm0
- movdqa xmm6, xmm0
- movdqa xmm7, xmm0
- psrlq xmm4, 8- 2
- psrlq xmm5, 16- 4
- psrlq xmm6, 24- 6
- psrlq xmm7, 32- 8
- pand xmm4, SIMD_SSE2_dword_color_bit_mask1
- pand xmm5, SIMD_SSE2_dword_color_bit_mask2
- pand xmm6, SIMD_SSE2_dword_color_bit_mask3
- pand xmm7, SIMD_SSE2_dword_color_bit_mask4
- por xmm5, xmm4
- por xmm7, xmm6
- por xmm7, xmm5
- movdqa xmm4, xmm0
- movdqa xmm5, xmm0
- movdqa xmm6, xmm0
- psrlq xmm4, 40-10
- psrlq xmm5, 48-12
- psrlq xmm6, 56-14
- pand xmm0, SIMD_SSE2_dword_color_bit_mask0
- pand xmm4, SIMD_SSE2_dword_color_bit_mask5
- pand xmm5, SIMD_SSE2_dword_color_bit_mask6
- pand xmm6, SIMD_SSE2_dword_color_bit_mask7
- por xmm4, xmm5
- por xmm0, xmm6
- por xmm7, xmm4
- por xmm7, xmm0
- mov esi, outPtr
- pshufd xmm7, xmm7, R_SHUFFLE_D( 0, 2, 1, 3 )
- pshuflw xmm7, xmm7, R_SHUFFLE_D( 0, 2, 1, 3 )
- movd [esi], xmm7
- }
- outData += 4;
- #elif defined ( ID_WIN_X86_SSE2_INTRIN )
- __m128i block0 = *((__m128i *)(&block[ 0]));
- __m128i block1 = *((__m128i *)(&block[16]));
- __m128i block2 = *((__m128i *)(&block[32]));
- __m128i block3 = *((__m128i *)(&block[48]));
- __m128c temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
- temp7 = _mm_cvtsi32_si128( channelBitOffset );
- temp0 = _mm_srl_epi32( block0, temp7 );
- temp5 = _mm_srl_epi32( block1, temp7 );
- temp6 = _mm_srl_epi32( block2, temp7 );
- temp4 = _mm_srl_epi32( block3, temp7 );
- temp0 = _mm_and_si128( temp0, (const __m128i &)SIMD_SSE2_dword_byte_mask );
- temp5 = _mm_and_si128( temp5, (const __m128i &)SIMD_SSE2_dword_byte_mask );
- temp6 = _mm_and_si128( temp6, (const __m128i &)SIMD_SSE2_dword_byte_mask );
- temp4 = _mm_and_si128( temp4, (const __m128i &)SIMD_SSE2_dword_byte_mask );
- temp0 = _mm_packus_epi16( temp0, temp5 );
- temp6 = _mm_packus_epi16( temp6, temp4 );
- //---------------------
- temp2 = _mm_cvtsi32_si128( maxGreen );
- temp2 = _mm_shufflelo_epi16( temp2, R_SHUFFLE_D( 0, 0, 0, 0 ) );
- temp3 = _mm_cvtsi32_si128( minGreen );
- temp3 = _mm_shufflelo_epi16( temp3, R_SHUFFLE_D( 0, 0, 0, 0 ) );
- temp2 = _mm_mullo_epi16( temp2, (const __m128i &)SIMD_SSE2_word_scale_5_3_1 );
- temp3 = _mm_mullo_epi16( temp3, (const __m128i &)SIMD_SSE2_word_scale_1_3_5 );
- temp2 = _mm_add_epi16( temp2, (const __m128i &)SIMD_SSE2_word_3 );
- temp3 = _mm_add_epi16( temp3, temp2 );
- temp3 = _mm_mulhi_epi16( temp3, (const __m128i &)SIMD_SSE2_word_div_by_6 );
- temp1 = _mm_shufflelo_epi16( temp3, R_SHUFFLE_D( 0, 0, 0, 0 ) );
- temp2 = _mm_shufflelo_epi16( temp3, R_SHUFFLE_D( 1, 1, 1, 1 ) );
- temp3 = _mm_shufflelo_epi16( temp3, R_SHUFFLE_D( 2, 2, 2, 2 ) );
- temp1 = _mm_shuffle_epi32( temp1, R_SHUFFLE_D( 0, 0, 0, 0 ) );
- temp2 = _mm_shuffle_epi32( temp2, R_SHUFFLE_D( 0, 0, 0, 0 ) );
- temp3 = _mm_shuffle_epi32( temp3, R_SHUFFLE_D( 0, 0, 0, 0 ) );
- temp1 = _mm_packus_epi16( temp1, temp1 );
- temp2 = _mm_packus_epi16( temp2, temp2 );
- temp3 = _mm_packus_epi16( temp3, temp3 );
- temp0 = _mm_packus_epi16( temp0, temp6 );
- temp1 = _mm_max_epu8( temp1, temp0 );
- temp2 = _mm_max_epu8( temp2, temp0 );
- temp3 = _mm_max_epu8( temp3, temp0 );
- temp1 = _mm_cmpeq_epi8( temp1, temp0 );
- temp2 = _mm_cmpeq_epi8( temp2, temp0 );
- temp3 = _mm_cmpeq_epi8( temp3, temp0 );
- temp0 = (const __m128i &)SIMD_SSE2_byte_4;
- temp0 = _mm_adds_epi8( temp0, temp1 );
- temp2 = _mm_adds_epi8( temp2, temp3 );
- temp0 = _mm_adds_epi8( temp0, temp2 );
- temp0 = _mm_and_si128( temp0, (const __m128i &)SIMD_SSE2_byte_3 );
- temp4 = (const __m128i &)SIMD_SSE2_byte_2;
- temp4 = _mm_cmpgt_epi8( temp4, temp0 );
- temp4 = _mm_and_si128( temp4, (const __m128i &)SIMD_SSE2_byte_1 );
- temp0 = _mm_xor_si128( temp0, temp4 );
- temp4 = _mm_srli_epi64( temp0, 8 - 2 );
- temp5 = _mm_srli_epi64( temp0, 16 - 4 );
- temp6 = _mm_srli_epi64( temp0, 24 - 6 );
- temp7 = _mm_srli_epi64( temp0, 32 - 8 );
- temp4 = _mm_and_si128( temp4, (const __m128i &)SIMD_SSE2_dword_color_bit_mask1 );
- temp5 = _mm_and_si128( temp5, (const __m128i &)SIMD_SSE2_dword_color_bit_mask2 );
- temp6 = _mm_and_si128( temp6, (const __m128i &)SIMD_SSE2_dword_color_bit_mask3 );
- temp7 = _mm_and_si128( temp7, (const __m128i &)SIMD_SSE2_dword_color_bit_mask4 );
- temp5 = _mm_or_si128( temp5, temp4 );
- temp7 = _mm_or_si128( temp7, temp6 );
- temp7 = _mm_or_si128( temp7, temp5 );
- temp4 = _mm_srli_epi64( temp0, 40 - 10 );
- temp5 = _mm_srli_epi64( temp0, 48 - 12 );
- temp6 = _mm_srli_epi64( temp0, 56 - 14 );
- temp0 = _mm_and_si128( temp0, (const __m128i &)SIMD_SSE2_dword_color_bit_mask0 );
- temp4 = _mm_and_si128( temp4, (const __m128i &)SIMD_SSE2_dword_color_bit_mask5 );
- temp5 = _mm_and_si128( temp5, (const __m128i &)SIMD_SSE2_dword_color_bit_mask6 );
- temp6 = _mm_and_si128( temp6, (const __m128i &)SIMD_SSE2_dword_color_bit_mask7 );
- temp4 = _mm_or_si128( temp4, temp5 );
- temp0 = _mm_or_si128( temp0, temp6 );
- temp7 = _mm_or_si128( temp7, temp4 );
- temp7 = _mm_or_si128( temp7, temp0 );
- temp7 = _mm_shuffle_epi32( temp7, R_SHUFFLE_D( 0, 2, 1, 3 ) );
- temp7 = _mm_shufflelo_epi16( temp7, R_SHUFFLE_D( 0, 2, 1, 3 ) );
- int result = _mm_cvtsi128_si32( temp7 );
- EmitUInt( result );
- #else
- assert( false );
- #endif
- }
- /*
- ========================
- idDxtEncoder::InsetNormalsBBoxDXT5_SSE2
- Insets, clamps to [0, 255] and quantizes the two 4-byte bounding box end points in place.
- All per-channel factors and masks come from SIMD_SSE2_word_insetNormalDXT5* constants
- that are defined elsewhere.
- paramO: minNormal - 4 bytes, read and then overwritten with the adjusted minimum
- paramO: maxNormal - 4 bytes, read and then overwritten with the adjusted maximum
- ========================
- */
- void idDxtEncoder::InsetNormalsBBoxDXT5_SSE2( byte *minNormal, byte *maxNormal ) const {
- #if ( defined( ID_WIN_X86_ASM ) || defined( ID_MAC_X86_ASM ) )
- __asm {
- mov esi, minNormal
- mov edi, maxNormal
- movd xmm0, dword ptr [esi] // xmm0 = minNormal
- movd xmm1, dword ptr [edi] // xmm1 = maxNormal
- punpcklbw xmm0, SIMD_SSE2_byte_0 // widen the 4 bytes to words (assuming byte_0 is all zero)
- punpcklbw xmm1, SIMD_SSE2_byte_0
- movdqa xmm2, xmm1
- psubw xmm2, xmm0 // xmm2 = max - min
- psubw xmm2, SIMD_SSE2_word_insetNormalDXT5Round
- pand xmm2, SIMD_SSE2_word_insetNormalDXT5Mask // xmm2 = inset (1 & 3)
- pmullw xmm0, SIMD_SSE2_word_insetNormalDXT5ShiftUp // scale both end points up ...
- pmullw xmm1, SIMD_SSE2_word_insetNormalDXT5ShiftUp
- paddw xmm0, xmm2 // ... move min up by the inset ...
- psubw xmm1, xmm2 // ... and max down by the inset ...
- pmulhw xmm0, SIMD_SSE2_word_insetNormalDXT5ShiftDown // xmm0 = mini
- pmulhw xmm1, SIMD_SSE2_word_insetNormalDXT5ShiftDown // xmm1 = maxi
- // mini and maxi must be >= 0 and <= 255
- pmaxsw xmm0, SIMD_SSE2_word_0
- pmaxsw xmm1, SIMD_SSE2_word_0
- pminsw xmm0, SIMD_SSE2_word_255
- pminsw xmm1, SIMD_SSE2_word_255
- movdqa xmm2, xmm0 // keep copies of the clamped values for the Rep multiply
- movdqa xmm3, xmm1
- pand xmm0, SIMD_SSE2_word_insetNormalDXT5QuantMask // drop the bits the quantization mask removes
- pand xmm1, SIMD_SSE2_word_insetNormalDXT5QuantMask
- pmulhw xmm2, SIMD_SSE2_word_insetNormalDXT5Rep // presumably the high bits that get replicated into the low bits
- pmulhw xmm3, SIMD_SSE2_word_insetNormalDXT5Rep
- por xmm0, xmm2 // merge them into the masked values
- por xmm1, xmm3
- packuswb xmm0, xmm0 // back to bytes
- packuswb xmm1, xmm1
- movd dword ptr [esi], xmm0 // overwrite minNormal
- movd dword ptr [edi], xmm1 // overwrite maxNormal
- }
- #elif defined ( ID_WIN_X86_SSE2_INTRIN )
- __m128i temp0, temp1, temp2, temp3;
- temp0 = _mm_cvtsi32_si128( *(int *)minNormal ); // temp0 = minNormal
- temp1 = _mm_cvtsi32_si128( *(int *)maxNormal ); // temp1 = maxNormal
- temp0 = _mm_unpacklo_epi8( temp0, (const __m128i &)SIMD_SSE2_byte_0 ); // widen the 4 bytes to words (assuming byte_0 is all zero)
- temp1 = _mm_unpacklo_epi8( temp1, (const __m128i &)SIMD_SSE2_byte_0 );
- temp2 = _mm_sub_epi16( temp1, temp0 ); // temp2 = max - min
- temp2 = _mm_sub_epi16( temp2, (const __m128i &)SIMD_SSE2_word_insetNormalDXT5Round );
- temp2 = _mm_and_si128( temp2, (const __m128i &)SIMD_SSE2_word_insetNormalDXT5Mask ); // temp2 = inset (1 & 3)
- temp0 = _mm_mullo_epi16( temp0, (const __m128i &)SIMD_SSE2_word_insetNormalDXT5ShiftUp ); // scale both end points up ...
- temp1 = _mm_mullo_epi16( temp1, (const __m128i &)SIMD_SSE2_word_insetNormalDXT5ShiftUp );
- temp0 = _mm_add_epi16( temp0, temp2 ); // ... move min up by the inset ...
- temp1 = _mm_sub_epi16( temp1, temp2 ); // ... and max down by the inset ...
- temp0 = _mm_mulhi_epi16( temp0, (const __m128i &)SIMD_SSE2_word_insetNormalDXT5ShiftDown ); // temp0 = mini
- temp1 = _mm_mulhi_epi16( temp1, (const __m128i &)SIMD_SSE2_word_insetNormalDXT5ShiftDown ); // temp1 = maxi
- // mini and maxi must be >= 0 and <= 255
- temp0 = _mm_max_epi16( temp0, (const __m128i &)SIMD_SSE2_word_0 );
- temp1 = _mm_max_epi16( temp1, (const __m128i &)SIMD_SSE2_word_0 );
- temp0 = _mm_min_epi16( temp0, (const __m128i &)SIMD_SSE2_word_255 );
- temp1 = _mm_min_epi16( temp1, (const __m128i &)SIMD_SSE2_word_255 );
- temp0 = _mm_and_si128( temp0, (const __m128i &)SIMD_SSE2_word_insetNormalDXT5QuantMask ); // drop the bits the quantization mask removes
- temp1 = _mm_and_si128( temp1, (const __m128i &)SIMD_SSE2_word_insetNormalDXT5QuantMask );
- temp2 = _mm_mulhi_epi16( temp0, (const __m128i &)SIMD_SSE2_word_insetNormalDXT5Rep ); // NOTE(review): multiplies the masked value, the asm path multiplies the unmasked copy - confirm the constants make both agree
- temp3 = _mm_mulhi_epi16( temp1, (const __m128i &)SIMD_SSE2_word_insetNormalDXT5Rep );
- temp0 = _mm_or_si128( temp0, temp2 ); // merge into the masked values
- temp1 = _mm_or_si128( temp1, temp3 );
- temp0 = _mm_packus_epi16( temp0, temp0 ); // back to bytes
- temp1 = _mm_packus_epi16( temp1, temp1 );
- *(int *)minNormal = _mm_cvtsi128_si32( temp0 ); // overwrite minNormal
- *(int *)maxNormal = _mm_cvtsi128_si32( temp1 ); // overwrite maxNormal
- #else
- assert( false ); // no SSE2 implementation was compiled in for this configuration
- #endif
- }
- /*
- ========================
- idDxtEncoder::CompressNormalMapDXT5Fast_SSE2
- Walks the image in 4x4 tiles (4 bytes per pixel) and emits one 16-byte DXT5 block per
- tile: byte 3 of every texel (Nx) goes into the alpha part, byte 1 (Ny) into the green
- channel of the color part. After every row of tiles srcPadding bytes are skipped on the
- input side and dstPadding bytes on the output side.
- params: inBuf - image to compress in _y_x component order
- paramO: outBuf - result of compression
- params: width - width of image, at least 4 and a multiple of 4
- params: height - height of image, at least 4 and a multiple of 4
- ========================
- */
- void idDxtEncoder::CompressNormalMapDXT5Fast_SSE2( const byte *inBuf, byte *outBuf, int width, int height ) {
- 	ALIGN16( byte block[64] );
- 	ALIGN16( byte normal1[4] );
- 	ALIGN16( byte normal2[4] );
- 	assert( width >= 4 && ( width & 3 ) == 0 );
- 	assert( height >= 4 && ( height & 3 ) == 0 );
- 	this->width = width;
- 	this->height = height;
- 	this->outData = outBuf;
- 	// Walk the source through a separate cursor so that inBuf still addresses the start
- 	// of the image when the TEST_COMPRESSION reference pass below reads it again.
- 	const byte *srcRow = inBuf;
- 	for ( int j = 0; j < height; j += 4, srcRow += width * 4*4 ) {
- 		for ( int i = 0; i < width; i += 4 ) {
- 			ExtractBlock_SSE2( srcRow + i * 4, width, block );
- 			// normal1 receives the minimum, normal2 the maximum of the block.
- 			GetMinMaxBBox_SSE2( block, normal1, normal2 );
- 			InsetNormalsBBoxDXT5_SSE2( normal1, normal2 );
- 			// Write out Nx into alpha channel.
- 			EmitByte( normal2[3] );
- 			EmitByte( normal1[3] );
- 			EmitAlphaIndices_SSE2( block, 3*8, normal1[3], normal2[3] );
- 			// Write out Ny into green channel; red and blue are taken from the first texel.
- 			EmitUShort( ColorTo565( block[0], normal2[1], block[2] ) );
- 			EmitUShort( ColorTo565( block[0], normal1[1], block[2] ) );
- 			EmitGreenIndices_SSE2( block, 1*8, normal1[1], normal2[1] );
- 		}
- 		outData += dstPadding;
- 		srcRow += srcPadding;
- 	}
- #ifdef TEST_COMPRESSION
- 	// Self check: compress the same image with the generic path into an unpadded scratch
- 	// buffer (16 bytes per 4x4 tile, i.e. width * height bytes) and compare block by block.
- 	int tmpDstPadding = dstPadding;
- 	dstPadding = 0;
- 	byte * testOutBuf = (byte *) _alloca16( width * height );
- 	CompressNormalMapDXT5Fast_Generic( inBuf, testOutBuf, width, height );
- 	for ( int j = 0; j < height / 4; j++ ) {
- 		for ( int i = 0; i < width / 4; i++ ) {
- 			byte * ptr1 = outBuf + ( j * width/4 + i ) * 16 + j * tmpDstPadding;
- 			byte * ptr2 = testOutBuf + ( j * width/4 + i ) * 16;
- 			for ( int k = 0; k < 16; k++ ) {
- 				assert( ptr1[k] == ptr2[k] );
- 			}
- 		}
- 	}
- 	dstPadding = tmpDstPadding;
- #endif
- }
- #endif
|