Util.pm 164 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618161916201621162216231624162516261627162816291630163116321633163416351636163716381639164016411642164316441645164616471648164916501651165216531654165516561657165816591660166116621663166416651666166716681669167016711672167316741675167616771678167916801681168216831684168516861687168816891690169116921693169416951696169716981699170017011702170317041705170617071708170917101711171217131714171517161717171817191720172117221723172417251726172717281729173017311732173317341735173617371738173917401741174217431744174517461747174817491750175117521753175417551756175717581759176017611762176317641765176617671768176917701771177217731774177517761777177817791780178117821783178417851786178717881789179017911792179317941795179617971798179918001801180218031804180518061807180818091810181118121813181418151816181718181819182018211822182318241825182618271828182918301831183218331834183518361837183818391840184118421843184418451846184718481849185018511852185318541855185618571858185918601861186218631864186518661867186818691870187118721873187418751876187718781879188018811882188318841885188618871888188918901891189218931894189518961897189818991900190119021903190419051906190719081909191019111912191319141915191619171918191919201921192219231924192519261927192819291930193119321933193419351936193719381939194019411942194319441945194619471948194919501951195219531954195519561957195819591960196119621963196419651966196719681969197019711972197319741975197619771978197919801981198219831984198519861987198819891990199119921993199419951996199719981999200020012002200320042005200620072008200920102011201220132014201520162017201820192020202120222023202420252026202720282029203020312032203320342035203620372038203920402041204220432044204520462047204820492050205120522053205420552056205720582059206020612062206320642065206620672068206920702071207220732074207520762077207820792080208120822083208420852086208720882089209020912092209320942095209620972098209921002101210221032104210521062107210821092110211121122113211421152116211721182119212021212122212321242125212621272128212921302131213221332134213521362137213821392140214121422143214421452146214721482149215021512152215321542155215621572158215921602161216221632164216521662167216821692170217121722173217421752176217721782179218021812182218321842185218621872188218921902191219221932194219521962197219821992200220122022203220422052206220722082209221022112212221322142215221622172218221922202221222222232224222522262227222822292230223122322233223422352236223722382239224022412242224322442245224622472248224922502251225222532254225522562257225822592260226122622263226422652266226722682269227022712272227322742275227622772278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277727782779278027812782278327842785278627872788278927902791279227932794279527962797279827992800280128022803280428052806280728082809281028112812281328142815281628172818281928202821282228232824282528262827282828292830283128322833283428352836283728382839284028412842284328442845284628472848284928502851285228532854285528562857285828592860286128622863286428652866286728682869287028712872287328742875287628772878287928802881288228832884288528862887288828892890289128922893289428952896289728982899290029012902290329042905290629072908290929102911291229132914291529162917291829192920292129222923292429252926292729282929293029312932293329342935293629372938293929402941294229432944294529462947294829492950295129522953295429552956295729582959296029612962296329642965296629672968296929702971297229732974297529762977297829792980298129822983298429852986298729882989299029912992299329942995299629972998299930003001300230033004300530063007300830093010301130123013301430153016301730183019302030213022302330243025302630273028302930303031303230333034303530363037303830393040304130423043304430453046304730483049305030513052305330543055305630573058305930603061306230633064306530663067306830693070307130723073307430753076307730783079308030813082308330843085308630873088308930903091309230933094309530963097309830993100310131023103310431053106310731083109311031113112311331143115311631173118311931203121312231233124312531263127312831293130313131323133313431353136313731383139314031413142314331443145314631473148314931503151315231533154315531563157315831593160316131623163316431653166316731683169317031713172317331743175317631773178317931803181318231833184318531863187318831893190319131923193319431953196319731983199320032013202320332043205320632073208320932103211321232133214321532163217321832193220322132223223322432253226322732283229323032313232323332343235323632373238323932403241324232433244324532463247324832493250325132523253325432553256325732583259326032613262326332643265326632673268326932703271327232733274327532763277327832793280328132823283328432853286328732883289329032913292329332943295329632973298329933003301330233033304330533063307330833093310331133123313331433153316331733183319332033213322332333243325332633273328332933303331333233333334333533363337333833393340334133423343334433453346334733483349335033513352335333543355335633573358335933603361336233633364336533663367336833693370337133723373337433753376337733783379338033813382338333843385338633873388338933903391339233933394339533963397339833993400340134023403340434053406340734083409341034113412341334143415341634173418341934203421342234233424342534263427342834293430343134323433343434353436343734383439344034413442344334443445344634473448344934503451345234533454345534563457345834593460346134623463346434653466346734683469347034713472347334743475347634773478347934803481348234833484348534863487348834893490349134923493349434953496349734983499350035013502350335043505350635073508350935103511351235133514351535163517351835193520352135223523352435253526352735283529353035313532353335343535353635373538353935403541354235433544354535463547354835493550355135523553355435553556355735583559356035613562356335643565356635673568356935703571357235733574357535763577357835793580358135823583358435853586358735883589359035913592359335943595359635973598359936003601360236033604360536063607360836093610361136123613361436153616361736183619362036213622362336243625362636273628362936303631363236333634363536363637363836393640364136423643364436453646364736483649365036513652365336543655365636573658365936603661366236633664366536663667366836693670367136723673367436753676367736783679368036813682368336843685368636873688368936903691369236933694369536963697369836993700370137023703370437053706370737083709371037113712371337143715371637173718371937203721372237233724372537263727372837293730373137323733373437353736373737383739374037413742374337443745374637473748374937503751375237533754375537563757375837593760376137623763376437653766376737683769377037713772377337743775377637773778377937803781378237833784378537863787378837893790379137923793379437953796379737983799380038013802380338043805380638073808380938103811381238133814381538163817381838193820382138223823382438253826382738283829383038313832383338343835383638373838383938403841384238433844384538463847384838493850385138523853385438553856385738583859386038613862386338643865386638673868386938703871387238733874387538763877387838793880388138823883388438853886388738883889389038913892389338943895389638973898389939003901390239033904390539063907390839093910391139123913391439153916391739183919392039213922392339243925392639273928392939303931393239333934393539363937393839393940394139423943394439453946394739483949395039513952395339543955395639573958395939603961396239633964396539663967396839693970397139723973397439753976397739783979398039813982398339843985398639873988398939903991399239933994399539963997399839994000400140024003400440054006400740084009401040114012401340144015401640174018401940204021402240234024402540264027402840294030403140324033403440354036403740384039404040414042404340444045404640474048404940504051405240534054405540564057405840594060406140624063406440654066406740684069407040714072407340744075407640774078407940804081408240834084408540864087408840894090409140924093409440954096409740984099410041014102410341044105410641074108410941104111411241134114411541164117411841194120412141224123412441254126412741284129413041314132413341344135413641374138413941404141414241434144414541464147414841494150415141524153415441554156415741584159416041614162416341644165416641674168416941704171417241734174417541764177417841794180418141824183418441854186418741884189419041914192419341944195419641974198419942004201420242034204420542064207420842094210421142124213421442154216421742184219422042214222422342244225422642274228422942304231423242334234423542364237423842394240424142424243424442454246424742484249425042514252425342544255425642574258425942604261426242634264426542664267426842694270427142724273427442754276427742784279428042814282428342844285428642874288428942904291429242934294429542964297429842994300430143024303430443054306430743084309431043114312431343144315431643174318431943204321432243234324432543264327432843294330433143324333433443354336433743384339434043414342434343444345434643474348434943504351435243534354435543564357435843594360436143624363436443654366436743684369437043714372437343744375437643774378437943804381438243834384438543864387438843894390439143924393439443954396439743984399440044014402440344044405440644074408440944104411441244134414441544164417441844194420442144224423442444254426442744284429443044314432443344344435443644374438443944404441444244434444444544464447444844494450445144524453445444554456445744584459446044614462446344644465446644674468446944704471447244734474447544764477447844794480448144824483448444854486448744884489449044914492449344944495449644974498449945004501450245034504450545064507450845094510451145124513451445154516451745184519452045214522452345244525452645274528452945304531453245334534453545364537453845394540454145424543454445454546454745484549455045514552455345544555455645574558455945604561456245634564456545664567456845694570457145724573457445754576457745784579458045814582458345844585458645874588458945904591459245934594459545964597459845994600460146024603460446054606460746084609461046114612461346144615461646174618461946204621462246234624462546264627462846294630463146324633463446354636463746384639464046414642464346444645464646474648464946504651465246534654465546564657465846594660466146624663466446654666466746684669467046714672467346744675467646774678467946804681468246834684468546864687468846894690469146924693469446954696469746984699470047014702470347044705470647074708470947104711471247134714471547164717471847194720472147224723472447254726472747284729473047314732473347344735473647374738473947404741474247434744474547464747474847494750475147524753475447554756475747584759476047614762476347644765476647674768476947704771477247734774477547764777477847794780478147824783478447854786478747884789479047914792479347944795479647974798479948004801480248034804480548064807480848094810481148124813481448154816481748184819482048214822482348244825482648274828482948304831483248334834483548364837483848394840484148424843484448454846484748484849485048514852485348544855485648574858485948604861486248634864486548664867486848694870487148724873487448754876487748784879488048814882488348844885488648874888488948904891489248934894489548964897489848994900490149024903490449054906490749084909491049114912491349144915491649174918491949204921492249234924492549264927492849294930493149324933493449354936493749384939494049414942494349444945494649474948494949504951495249534954495549564957495849594960496149624963496449654966496749684969497049714972497349744975497649774978497949804981498249834984498549864987498849894990499149924993499449954996499749984999500050015002500350045005500650075008500950105011501250135014501550165017501850195020502150225023502450255026502750285029503050315032503350345035503650375038503950405041504250435044504550465047504850495050505150525053505450555056505750585059506050615062506350645065506650675068506950705071507250735074507550765077507850795080508150825083508450855086508750885089509050915092509350945095509650975098509951005101510251035104510551065107510851095110511151125113511451155116511751185119512051215122512351245125512651275128512951305131513251335134513551365137513851395140514151425143514451455146514751485149515051515152515351545155515651575158515951605161516251635164516551665167516851695170517151725173517451755176517751785179518051815182518351845185518651875188518951905191519251935194519551965197519851995200520152025203520452055206520752085209521052115212521352145215521652175218521952205221522252235224522552265227522852295230523152325233523452355236523752385239524052415242524352445245524652475248524952505251525252535254525552565257525852595260526152625263526452655266526752685269527052715272527352745275527652775278527952805281528252835284528552865287528852895290529152925293529452955296529752985299530053015302530353045305530653075308530953105311531253135314531553165317531853195320532153225323532453255326532753285329533053315332533353345335533653375338533953405341534253435344534553465347534853495350535153525353535453555356535753585359536053615362536353645365536653675368536953705371537253735374537553765377537853795380538153825383538453855386538753885389539053915392539353945395539653975398539954005401540254035404540554065407540854095410541154125413541454155416541754185419542054215422542354245425542654275428542954305431543254335434543554365437543854395440544154425443544454455446544754485449545054515452545354545455545654575458545954605461546254635464546554665467546854695470547154725473547454755476547754785479548054815482548354845485548654875488548954905491549254935494549554965497549854995500550155025503550455055506550755085509551055115512551355145515551655175518551955205521552255235524552555265527552855295530553155325533553455355536553755385539554055415542554355445545554655475548554955505551555255535554555555565557555855595560556155625563556455655566556755685569557055715572557355745575557655775578557955805581558255835584558555865587558855895590559155925593559455955596559755985599560056015602560356045605560656075608560956105611561256135614561556165617561856195620562156225623562456255626562756285629563056315632563356345635563656375638563956405641564256435644564556465647564856495650565156525653565456555656565756585659566056615662566356645665566656675668566956705671567256735674567556765677567856795680568156825683568456855686568756885689569056915692569356945695569656975698569957005701570257035704570557065707570857095710571157125713571457155716571757185719572057215722572357245725572657275728572957305731573257335734573557365737573857395740574157425743574457455746574757485749575057515752575357545755575657575758575957605761576257635764576557665767576857695770577157725773577457755776577757785779578057815782578357845785578657875788578957905791
  1. package Compression::Util;
  2. use utf8;
  3. use 5.036;
  4. use List::Util qw(min uniq max sum all);
  5. use Carp qw(confess);
  6. require Exporter;
  7. our @ISA = qw(Exporter);
  8. our $VERSION = '0.13';
  9. our $VERBOSE = 0; # verbose mode
  10. our $LZ_MIN_LEN = 4; # minimum match length in LZ parsing
  11. our $LZ_MAX_LEN = 1 << 15; # maximum match length in LZ parsing
  12. our $LZ_MAX_DIST = ~0; # maximum allowed back-reference distance in LZ parsing
  13. our $LZ_MAX_CHAIN_LEN = 32; # how many recent positions to remember in LZ parsing
  14. # Arithmetic Coding settings
  15. use constant BITS => 32;
  16. use constant MAX => oct('0b' . ('1' x BITS));
  17. use constant INITIAL_FREQ => 1;
  18. our %EXPORT_TAGS = (
  19. 'all' => [
  20. qw(
  21. crc32
  22. read_bit
  23. read_bit_lsb
  24. read_bits
  25. read_bits_lsb
  26. int2bits
  27. int2bits_lsb
  28. int2bytes
  29. int2bytes_lsb
  30. bits2int
  31. bits2int_lsb
  32. bytes2int
  33. bytes2int_lsb
  34. string2symbols
  35. symbols2string
  36. read_null_terminated
  37. bwt_encode
  38. bwt_decode
  39. bwt_encode_symbolic
  40. bwt_decode_symbolic
  41. bwt_sort
  42. bwt_sort_symbolic
  43. bwt_compress
  44. bwt_decompress
  45. bwt_compress_symbolic
  46. bwt_decompress_symbolic
  47. bzip2_compress
  48. bzip2_decompress
  49. gzip_compress
  50. gzip_decompress
  51. mrl_compress
  52. mrl_decompress
  53. mrl_compress_symbolic
  54. mrl_decompress_symbolic
  55. create_huffman_entry
  56. decode_huffman_entry
  57. delta_encode
  58. delta_decode
  59. huffman_encode
  60. huffman_decode
  61. huffman_from_freq
  62. huffman_from_symbols
  63. huffman_from_code_lengths
  64. mtf_encode
  65. mtf_decode
  66. encode_alphabet
  67. decode_alphabet
  68. encode_alphabet_256
  69. decode_alphabet_256
  70. deltas
  71. accumulate
  72. frequencies
  73. run_length
  74. binary_vrl_encode
  75. binary_vrl_decode
  76. rle4_encode
  77. rle4_decode
  78. zrle_encode
  79. zrle_decode
  80. lzss_compress
  81. lzss_decompress
  82. make_deflate_tables
  83. find_deflate_index
  84. deflate_encode
  85. deflate_decode
  86. lzss_encode
  87. lzss_encode_fast
  88. lzss_encode_fast_symbolic
  89. lzss_decode
  90. lzss_encode_symbolic
  91. lzss_decode_symbolic
  92. lzss_compress_symbolic
  93. lzss_decompress_symbolic
  94. lz77_encode
  95. lz77_decode
  96. lz77_encode_symbolic
  97. lz77_decode_symbolic
  98. lz77_compress
  99. lz77_decompress
  100. lz77_compress_symbolic
  101. lz77_decompress_symbolic
  102. lzb_compress
  103. lzb_decompress
  104. lz4_compress
  105. lz4_decompress
  106. ac_encode
  107. ac_decode
  108. create_ac_entry
  109. decode_ac_entry
  110. adaptive_ac_encode
  111. adaptive_ac_decode
  112. create_adaptive_ac_entry
  113. decode_adaptive_ac_entry
  114. abc_encode
  115. abc_decode
  116. fibonacci_encode
  117. fibonacci_decode
  118. elias_gamma_encode
  119. elias_gamma_decode
  120. elias_omega_encode
  121. elias_omega_decode
  122. obh_encode
  123. obh_decode
  124. lzw_encode
  125. lzw_decode
  126. lzw_compress
  127. lzw_decompress
  128. )
  129. ]
  130. );
  131. our @EXPORT_OK = (@{$EXPORT_TAGS{'all'}}, '$VERBOSE', '$LZ_MAX_CHAIN_LEN', '$LZ_MIN_LEN', '$LZ_MAX_LEN', '$LZ_MAX_DIST');
  132. our @EXPORT;
  133. ##########################
  134. # Misc low-level functions
  135. ##########################
  136. sub read_bit ($fh, $bitstring) {
  137. if (($$bitstring // '') eq '') {
  138. $$bitstring = unpack('b*', getc($fh) // confess "can't read bit");
  139. }
  140. chop($$bitstring);
  141. }
  142. sub read_bit_lsb ($fh, $bitstring) {
  143. if (($$bitstring // '') eq '') {
  144. $$bitstring = unpack('B*', getc($fh) // confess "can't read bit");
  145. }
  146. chop($$bitstring);
  147. }
  148. sub read_bits ($fh, $bits_len) {
  149. read($fh, (my $data), $bits_len >> 3) // confess "Read error: $!";
  150. $data = unpack('B*', $data);
  151. while (length($data) < $bits_len) {
  152. $data .= unpack('B*', getc($fh) // confess "can't read bits");
  153. }
  154. if (length($data) > $bits_len) {
  155. $data = substr($data, 0, $bits_len);
  156. }
  157. return $data;
  158. }
  159. sub read_bits_lsb ($fh, $bits_len) {
  160. read($fh, (my $data), $bits_len >> 3) // confess "Read error: $!";
  161. $data = unpack('b*', $data);
  162. while (length($data) < $bits_len) {
  163. $data .= unpack('b*', getc($fh) // confess "can't read bits");
  164. }
  165. if (length($data) > $bits_len) {
  166. $data = substr($data, 0, $bits_len);
  167. }
  168. return $data;
  169. }
  170. sub int2bits ($value, $size) {
  171. sprintf("%0*b", $size, $value);
  172. }
  173. sub int2bits_lsb ($value, $size) {
  174. scalar reverse sprintf("%0*b", $size, $value);
  175. }
  176. sub int2bytes ($value, $size) {
  177. pack('B*', sprintf("%0*b", 8 * $size, $value));
  178. }
  179. sub int2bytes_lsb ($value, $size) {
  180. pack('b*', scalar reverse sprintf("%0*b", 8 * $size, $value));
  181. }
  182. sub bytes2int($fh, $n) {
  183. if (ref($fh) eq '') {
  184. open(my $fh2, '<:raw', \$fh) or confess "error: $!";
  185. return __SUB__->($fh2, $n);
  186. }
  187. my $bytes = '';
  188. $bytes .= getc($fh) for (1 .. $n);
  189. oct('0b' . unpack('B*', $bytes));
  190. }
  191. sub bytes2int_lsb ($fh, $n) {
  192. if (ref($fh) eq '') {
  193. open(my $fh2, '<:raw', \$fh) or confess "error: $!";
  194. return __SUB__->($fh2, $n);
  195. }
  196. my $bytes = '';
  197. $bytes .= getc($fh) for (1 .. $n);
  198. oct('0b' . reverse unpack('b*', $bytes));
  199. }
  200. sub bits2int ($fh, $size, $buffer) {
  201. if ($size % 8 == 0 and ($$buffer // '') eq '') { # optimization
  202. return bytes2int($fh, $size >> 3);
  203. }
  204. my $bitstring = '0b';
  205. for (1 .. $size) {
  206. $bitstring .= ($$buffer // '') eq '' ? read_bit($fh, $buffer) : chop($$buffer);
  207. }
  208. oct($bitstring);
  209. }
  210. sub bits2int_lsb ($fh, $size, $buffer) {
  211. if ($size % 8 == 0 and ($$buffer // '') eq '') { # optimization
  212. return bytes2int_lsb($fh, $size >> 3);
  213. }
  214. my $bitstring = '';
  215. for (1 .. $size) {
  216. $bitstring .= ($$buffer // '') eq '' ? read_bit_lsb($fh, $buffer) : chop($$buffer);
  217. }
  218. oct('0b' . reverse($bitstring));
  219. }
  220. sub string2symbols ($string) {
  221. [unpack('C*', $string)];
  222. }
  223. sub symbols2string ($symbols) {
  224. pack('C*', @$symbols);
  225. }
  226. sub read_null_terminated ($fh) {
  227. my $string = '';
  228. while (1) {
  229. my $c = getc($fh) // confess "can't read character";
  230. last if $c eq "\0";
  231. $string .= $c;
  232. }
  233. return $string;
  234. }
  235. sub frequencies ($symbols) {
  236. my %freq;
  237. ++$freq{$_} for @$symbols;
  238. return \%freq;
  239. }
  240. sub deltas ($integers) {
  241. my @deltas;
  242. my $prev = 0;
  243. foreach my $n (@$integers) {
  244. push @deltas, $n - $prev;
  245. $prev = $n;
  246. }
  247. return \@deltas;
  248. }
  249. sub accumulate ($deltas) {
  250. my @acc;
  251. my $prev = 0;
  252. foreach my $d (@$deltas) {
  253. $prev += $d;
  254. push @acc, $prev;
  255. }
  256. return \@acc;
  257. }
  258. ########################
  259. # Fibonacci Coding
  260. ########################
  261. sub fibonacci_encode ($symbols) {
  262. my $bitstring = '';
  263. foreach my $n (scalar(@$symbols), @$symbols) {
  264. my ($f1, $f2, $f3) = (0, 1, 1);
  265. my ($rn, $s, $k) = ($n + 1, '', 2);
  266. for (; $f3 <= $rn ; ++$k) {
  267. ($f1, $f2, $f3) = ($f2, $f3, $f2 + $f3);
  268. }
  269. foreach my $i (1 .. $k - 2) {
  270. ($f3, $f2, $f1) = ($f2, $f1, $f2 - $f1);
  271. if ($f3 <= $rn) {
  272. $rn -= $f3;
  273. $s .= '1';
  274. }
  275. else {
  276. $s .= '0';
  277. }
  278. }
  279. $bitstring .= reverse($s) . '1';
  280. }
  281. pack('B*', $bitstring);
  282. }
  283. sub fibonacci_decode ($fh) {
  284. if (ref($fh) eq '') {
  285. open(my $fh2, '<:raw', \$fh) or confess "error: $!";
  286. return __SUB__->($fh2);
  287. }
  288. my @symbols;
  289. my $enc = '';
  290. my $prev_bit = '0';
  291. my $len = 0;
  292. my $buffer = '';
  293. for (my $k = 0 ; $k <= $len ;) {
  294. my $bit = read_bit($fh, \$buffer);
  295. if ($bit eq '1' and $prev_bit eq '1') {
  296. my ($value, $f1, $f2) = (0, 1, 1);
  297. foreach my $bit (split //, $enc) {
  298. $value += $f2 if $bit;
  299. ($f1, $f2) = ($f2, $f1 + $f2);
  300. }
  301. push @symbols, $value - 1;
  302. $len = pop @symbols if (++$k == 1);
  303. $enc = '';
  304. $prev_bit = '0';
  305. }
  306. else {
  307. $enc .= $bit;
  308. $prev_bit = $bit;
  309. }
  310. }
  311. return \@symbols;
  312. }
  313. #######################################
  314. # Adaptive Binary Concatenation method
  315. #######################################
  316. sub abc_encode ($integers) {
  317. my @counts;
  318. my $count = 0;
  319. my $bits_width = 1;
  320. my $bits_max_symbol = 1 << $bits_width;
  321. my $processed_len = 0;
  322. foreach my $k (@$integers) {
  323. while ($k >= $bits_max_symbol) {
  324. if ($count > 0) {
  325. push @counts, [$bits_width, $count];
  326. $processed_len += $count;
  327. }
  328. $count = 0;
  329. $bits_max_symbol *= 2;
  330. $bits_width += 1;
  331. }
  332. ++$count;
  333. }
  334. push @counts, grep { $_->[1] > 0 } [$bits_width, scalar(@$integers) - $processed_len];
  335. $VERBOSE && say STDERR "Bit sizes: ", join(' ', map { $_->[0] } @counts);
  336. $VERBOSE && say STDERR "Lengths : ", join(' ', map { $_->[1] } @counts);
  337. $VERBOSE && say STDERR '';
  338. my $compressed = fibonacci_encode([(map { $_->[0] } @counts), (map { $_->[1] } @counts)]);
  339. my $bits = '';
  340. my @ints = @$integers;
  341. foreach my $pair (@counts) {
  342. my ($blen, $len) = @$pair;
  343. foreach my $symbol (splice(@ints, 0, $len)) {
  344. $bits .= sprintf("%0*b", $blen, $symbol);
  345. }
  346. }
  347. $compressed .= pack('B*', $bits);
  348. return $compressed;
  349. }
  350. sub abc_decode ($fh) {
  351. if (ref($fh) eq '') {
  352. open(my $fh2, '<:raw', \$fh) or confess "error: $!";
  353. return __SUB__->($fh2);
  354. }
  355. my $ints = fibonacci_decode($fh);
  356. my $half = scalar(@$ints) >> 1;
  357. my @counts;
  358. foreach my $i (0 .. ($half - 1)) {
  359. push @counts, [$ints->[$i], $ints->[$half + $i]];
  360. }
  361. my $bits_len = 0;
  362. foreach my $pair (@counts) {
  363. my ($blen, $len) = @$pair;
  364. $bits_len += $blen * $len;
  365. }
  366. my $bits = read_bits($fh, $bits_len);
  367. my @integers;
  368. foreach my $pair (@counts) {
  369. my ($blen, $len) = @$pair;
  370. foreach my $chunk (unpack(sprintf('(a%d)*', $blen), substr($bits, 0, $blen * $len, ''))) {
  371. push @integers, oct('0b' . $chunk);
  372. }
  373. }
  374. return \@integers;
  375. }
  376. ###################################
  377. # Arithmetic Coding (in fixed bits)
  378. ###################################
  379. sub _create_cfreq ($freq) {
  380. my @cf;
  381. my $T = 0;
  382. foreach my $i (sort { $a <=> $b } keys %$freq) {
  383. $freq->{$i} // next;
  384. $cf[$i] = $T;
  385. $T += $freq->{$i};
  386. $cf[$i + 1] = $T;
  387. }
  388. return (\@cf, $T);
  389. }
  390. sub ac_encode ($symbols) {
  391. if (ref($symbols) eq '') {
  392. $symbols = string2symbols($symbols);
  393. }
  394. my $enc = '';
  395. my $EOF_SYMBOL = (max(@$symbols) // 0) + 1;
  396. my @bytes = (@$symbols, $EOF_SYMBOL);
  397. my $freq = frequencies(\@bytes);
  398. my ($cf, $T) = _create_cfreq($freq);
  399. if ($T > MAX) {
  400. confess "Too few bits: $T > ${\MAX}";
  401. }
  402. my $low = 0;
  403. my $high = MAX;
  404. my $uf_count = 0;
  405. foreach my $c (@bytes) {
  406. my $w = $high - $low + 1;
  407. $high = ($low + int(($w * $cf->[$c + 1]) / $T) - 1) & MAX;
  408. $low = ($low + int(($w * $cf->[$c]) / $T)) & MAX;
  409. if ($high > MAX) {
  410. confess "high > MAX: $high > ${\MAX}";
  411. }
  412. if ($low >= $high) { confess "$low >= $high" }
  413. while (1) {
  414. if (($high >> (BITS - 1)) == ($low >> (BITS - 1))) {
  415. my $bit = $high >> (BITS - 1);
  416. $enc .= $bit;
  417. if ($uf_count > 0) {
  418. $enc .= join('', 1 - $bit) x $uf_count;
  419. $uf_count = 0;
  420. }
  421. $low <<= 1;
  422. ($high <<= 1) |= 1;
  423. }
  424. elsif (((($low >> (BITS - 2)) & 0x1) == 1) && ((($high >> (BITS - 2)) & 0x1) == 0)) {
  425. ($high <<= 1) |= (1 << (BITS - 1));
  426. $high |= 1;
  427. ($low <<= 1) &= ((1 << (BITS - 1)) - 1);
  428. ++$uf_count;
  429. }
  430. else {
  431. last;
  432. }
  433. $low &= MAX;
  434. $high &= MAX;
  435. }
  436. }
  437. $enc .= '0';
  438. $enc .= '1';
  439. while (length($enc) % 8 != 0) {
  440. $enc .= '1';
  441. }
  442. return ($enc, $freq);
  443. }
  444. sub ac_decode ($fh, $freq) {
  445. if (ref($fh) eq '') {
  446. open(my $fh2, '<:raw', \$fh) or confess "error: $!";
  447. return __SUB__->($fh2, $freq);
  448. }
  449. my ($cf, $T) = _create_cfreq($freq);
  450. my @dec;
  451. my $low = 0;
  452. my $high = MAX;
  453. my $enc = oct('0b' . join '', map { getc($fh) // 1 } 1 .. BITS);
  454. my @table;
  455. foreach my $i (sort { $a <=> $b } keys %$freq) {
  456. foreach my $j ($cf->[$i] .. $cf->[$i + 1] - 1) {
  457. $table[$j] = $i;
  458. }
  459. }
  460. my $EOF_SYMBOL = max(keys %$freq) // 0;
  461. while (1) {
  462. my $w = $high - $low + 1;
  463. my $ss = int((($T * ($enc - $low + 1)) - 1) / $w);
  464. my $i = $table[$ss] // last;
  465. last if ($i == $EOF_SYMBOL);
  466. push @dec, $i;
  467. $high = ($low + int(($w * $cf->[$i + 1]) / $T) - 1) & MAX;
  468. $low = ($low + int(($w * $cf->[$i]) / $T)) & MAX;
  469. if ($high > MAX) {
  470. confess "error";
  471. }
  472. if ($low >= $high) { confess "$low >= $high" }
  473. while (1) {
  474. if (($high >> (BITS - 1)) == ($low >> (BITS - 1))) {
  475. ($high <<= 1) |= 1;
  476. $low <<= 1;
  477. ($enc <<= 1) |= (getc($fh) // 1);
  478. }
  479. elsif (((($low >> (BITS - 2)) & 0x1) == 1) && ((($high >> (BITS - 2)) & 0x1) == 0)) {
  480. ($high <<= 1) |= (1 << (BITS - 1));
  481. $high |= 1;
  482. ($low <<= 1) &= ((1 << (BITS - 1)) - 1);
  483. $enc = (($enc >> (BITS - 1)) << (BITS - 1)) | (($enc & ((1 << (BITS - 2)) - 1)) << 1) | (getc($fh) // 1);
  484. }
  485. else {
  486. last;
  487. }
  488. $low &= MAX;
  489. $high &= MAX;
  490. $enc &= MAX;
  491. }
  492. }
  493. return \@dec;
  494. }
  495. #############################################
  496. # Adaptive Arithemtic Coding (in fixed bits)
  497. #############################################
  498. sub _create_adaptive_cfreq ($freq_value, $alphabet_size) {
  499. my $T = 0;
  500. my (@cf, @freq);
  501. foreach my $i (0 .. $alphabet_size) {
  502. $freq[$i] = $freq_value;
  503. $cf[$i] = $T;
  504. $T += $freq_value;
  505. $cf[$i + 1] = $T;
  506. }
  507. return (\@freq, \@cf, $T);
  508. }
  509. sub _increment_freq ($c, $alphabet_size, $freq, $cf) {
  510. ++$freq->[$c];
  511. my $T = $cf->[$c];
  512. foreach my $i ($c .. $alphabet_size) {
  513. $cf->[$i] = $T;
  514. $T += $freq->[$i];
  515. $cf->[$i + 1] = $T;
  516. }
  517. return $T;
  518. }
  519. sub adaptive_ac_encode ($symbols) {
  520. if (ref($symbols) eq '') {
  521. $symbols = string2symbols($symbols);
  522. }
  523. my $enc = '';
  524. my @alphabet = sort { $a <=> $b } uniq(@$symbols);
  525. my $EOF_SYMBOL = scalar(@alphabet) ? ($alphabet[-1] + 1) : 1;
  526. push @alphabet, $EOF_SYMBOL;
  527. my $alphabet_size = $#alphabet;
  528. my ($freq, $cf, $T) = _create_adaptive_cfreq(INITIAL_FREQ, $alphabet_size);
  529. my %table;
  530. @table{@alphabet} = (0 .. $alphabet_size);
  531. if ($T > MAX) {
  532. confess "Too few bits: $T > ${\MAX}";
  533. }
  534. my $low = 0;
  535. my $high = MAX;
  536. my $uf_count = 0;
  537. foreach my $value (@$symbols, $EOF_SYMBOL) {
  538. my $c = $table{$value};
  539. my $w = $high - $low + 1;
  540. $high = ($low + int(($w * $cf->[$c + 1]) / $T) - 1) & MAX;
  541. $low = ($low + int(($w * $cf->[$c]) / $T)) & MAX;
  542. $T = _increment_freq($c, $alphabet_size, $freq, $cf);
  543. if ($high > MAX) {
  544. confess "high > MAX: $high > ${\MAX}";
  545. }
  546. if ($low >= $high) { confess "$low >= $high" }
  547. while (1) {
  548. if (($high >> (BITS - 1)) == ($low >> (BITS - 1))) {
  549. my $bit = $high >> (BITS - 1);
  550. $enc .= $bit;
  551. if ($uf_count > 0) {
  552. $enc .= join('', 1 - $bit) x $uf_count;
  553. $uf_count = 0;
  554. }
  555. $low <<= 1;
  556. ($high <<= 1) |= 1;
  557. }
  558. elsif (((($low >> (BITS - 2)) & 0x1) == 1) && ((($high >> (BITS - 2)) & 0x1) == 0)) {
  559. ($high <<= 1) |= (1 << (BITS - 1));
  560. $high |= 1;
  561. ($low <<= 1) &= ((1 << (BITS - 1)) - 1);
  562. ++$uf_count;
  563. }
  564. else {
  565. last;
  566. }
  567. $low &= MAX;
  568. $high &= MAX;
  569. }
  570. }
  571. $enc .= '0';
  572. $enc .= '1';
  573. while (length($enc) % 8 != 0) {
  574. $enc .= '1';
  575. }
  576. return ($enc, \@alphabet);
  577. }
  578. sub adaptive_ac_decode ($fh, $alphabet) {
  579. if (ref($fh) eq '') {
  580. open(my $fh2, '<:raw', \$fh) or confess "error: $!";
  581. return __SUB__->($fh2, $alphabet);
  582. }
  583. my @dec;
  584. my $low = 0;
  585. my $high = MAX;
  586. my $alphabet_size = $#{$alphabet};
  587. my ($freq, $cf, $T) = _create_adaptive_cfreq(INITIAL_FREQ, $alphabet_size);
  588. my $enc = oct('0b' . join '', map { getc($fh) // 1 } 1 .. BITS);
  589. while (1) {
  590. my $w = ($high + 1) - $low;
  591. my $ss = int((($T * ($enc - $low + 1)) - 1) / $w);
  592. my $i = 0;
  593. foreach my $j (0 .. $alphabet_size) {
  594. if ($cf->[$j] <= $ss and $ss < $cf->[$j + 1]) {
  595. $i = $j;
  596. last;
  597. }
  598. }
  599. last if ($i == $alphabet_size);
  600. push @dec, $alphabet->[$i];
  601. $high = ($low + int(($w * $cf->[$i + 1]) / $T) - 1) & MAX;
  602. $low = ($low + int(($w * $cf->[$i]) / $T)) & MAX;
  603. $T = _increment_freq($i, $alphabet_size, $freq, $cf);
  604. if ($high > MAX) {
  605. confess "high > MAX: ($high > ${\MAX})";
  606. }
  607. if ($low >= $high) { confess "$low >= $high" }
  608. while (1) {
  609. if (($high >> (BITS - 1)) == ($low >> (BITS - 1))) {
  610. ($high <<= 1) |= 1;
  611. $low <<= 1;
  612. ($enc <<= 1) |= (getc($fh) // 1);
  613. }
  614. elsif (((($low >> (BITS - 2)) & 0x1) == 1) && ((($high >> (BITS - 2)) & 0x1) == 0)) {
  615. ($high <<= 1) |= (1 << (BITS - 1));
  616. $high |= 1;
  617. ($low <<= 1) &= ((1 << (BITS - 1)) - 1);
  618. $enc = (($enc >> (BITS - 1)) << (BITS - 1)) | (($enc & ((1 << (BITS - 2)) - 1)) << 1) | (getc($fh) // 1);
  619. }
  620. else {
  621. last;
  622. }
  623. $low &= MAX;
  624. $high &= MAX;
  625. $enc &= MAX;
  626. }
  627. }
  628. return \@dec;
  629. }
  630. #####################
  631. # Generic run-length
  632. #####################
  633. sub run_length ($arr, $max_run = undef) {
  634. @$arr || return [];
  635. my @result = [$arr->[0], 1];
  636. my $prev_value = $arr->[0];
  637. foreach my $i (1 .. $#$arr) {
  638. my $curr_value = $arr->[$i];
  639. if ($curr_value == $prev_value and (defined($max_run) ? $result[-1][1] < $max_run : 1)) {
  640. ++$result[-1][1];
  641. }
  642. else {
  643. push(@result, [$curr_value, 1]);
  644. }
  645. $prev_value = $curr_value;
  646. }
  647. return \@result;
  648. }
  649. ######################################
  650. # Binary variable run-length encoding
  651. ######################################
  652. sub binary_vrl_encode ($bitstring) {
  653. my @bits = split(//, $bitstring);
  654. my $encoded = $bits[0];
  655. foreach my $rle (@{run_length(\@bits)}) {
  656. my ($c, $v) = @$rle;
  657. if ($v == 1) {
  658. $encoded .= '0';
  659. }
  660. else {
  661. my $t = sprintf('%b', $v - 1);
  662. $encoded .= join('', '1' x length($t), '0', substr($t, 1));
  663. }
  664. }
  665. return $encoded;
  666. }
  667. sub binary_vrl_decode ($bitstring) {
  668. my $decoded = '';
  669. my $bit = substr($bitstring, 0, 1, '');
  670. while ($bitstring ne '') {
  671. $decoded .= $bit;
  672. my $bl = 0;
  673. while (substr($bitstring, 0, 1, '') eq '1') {
  674. ++$bl;
  675. }
  676. if ($bl > 0) {
  677. $decoded .= $bit x oct('0b1' . join('', map { substr($bitstring, 0, 1, '') } 1 .. $bl - 1));
  678. }
  679. $bit = ($bit eq '1' ? '0' : '1');
  680. }
  681. return $decoded;
  682. }
  683. ############################
  684. # Burrows-Wheeler transform
  685. ############################
  686. sub bwt_sort ($s, $LOOKAHEAD_LEN = 128) { # O(n * LOOKAHEAD_LEN) space (fast)
  687. #<<<
  688. [
  689. map { $_->[1] } sort {
  690. ($a->[0] cmp $b->[0])
  691. || ((substr($s, $a->[1]) . substr($s, 0, $a->[1])) cmp (substr($s, $b->[1]) . substr($s, 0, $b->[1])))
  692. }
  693. map {
  694. my $t = substr($s, $_, $LOOKAHEAD_LEN);
  695. if (length($t) < $LOOKAHEAD_LEN) {
  696. $t .= substr($s, 0, ($_ < $LOOKAHEAD_LEN) ? $_ : ($LOOKAHEAD_LEN - length($t)));
  697. }
  698. [$t, $_]
  699. } 0 .. length($s) - 1
  700. ];
  701. #>>>
  702. }
  703. sub bwt_encode ($s, $LOOKAHEAD_LEN = 128) {
  704. if (ref($s) ne '') {
  705. return bwt_encode_symbolic($s);
  706. }
  707. my $bwt = bwt_sort($s, $LOOKAHEAD_LEN);
  708. my $ret = join('', map { substr($s, $_ - 1, 1) } @$bwt);
  709. my $idx = 0;
  710. foreach my $i (@$bwt) {
  711. $i || last;
  712. ++$idx;
  713. }
  714. return ($ret, $idx);
  715. }
  716. sub bwt_decode ($bwt, $idx) { # fast inversion
  717. my @tail = split(//, $bwt);
  718. my @head = sort @tail;
  719. my %indices;
  720. foreach my $i (0 .. $#tail) {
  721. push @{$indices{$tail[$i]}}, $i;
  722. }
  723. my @table;
  724. foreach my $v (@head) {
  725. push @table, shift(@{$indices{$v}});
  726. }
  727. my $dec = '';
  728. my $i = $idx;
  729. for (1 .. scalar(@head)) {
  730. $dec .= $head[$i];
  731. $i = $table[$i];
  732. }
  733. return $dec;
  734. }
  735. ##############################################
  736. # Burrows-Wheeler transform (symbolic variant)
  737. ##############################################
  738. sub bwt_sort_symbolic ($s) { # O(n) space (slowish)
  739. my @cyclic = @$s;
  740. my $len = scalar(@cyclic);
  741. my $rle = 1;
  742. foreach my $i (1 .. $len - 1) {
  743. if ($cyclic[$i] != $cyclic[$i - 1]) {
  744. $rle = 0;
  745. last;
  746. }
  747. }
  748. $rle && return [0 .. $len - 1];
  749. [
  750. sort {
  751. my ($i, $j) = ($a, $b);
  752. while ($cyclic[$i] == $cyclic[$j]) {
  753. $i %= $len if (++$i >= $len);
  754. $j %= $len if (++$j >= $len);
  755. }
  756. $cyclic[$i] <=> $cyclic[$j];
  757. } 0 .. $len - 1
  758. ];
  759. }
  760. sub bwt_encode_symbolic ($symbols) {
  761. if (ref($symbols) eq '') {
  762. $symbols = string2symbols($symbols);
  763. }
  764. my $bwt = bwt_sort_symbolic($symbols);
  765. my @ret = map { $symbols->[$_ - 1] } @$bwt;
  766. my $idx = 0;
  767. foreach my $i (@$bwt) {
  768. $i || last;
  769. ++$idx;
  770. }
  771. return (\@ret, $idx);
  772. }
  773. sub bwt_decode_symbolic ($bwt, $idx) { # fast inversion
  774. my @head = sort { $a <=> $b } @$bwt;
  775. my %indices;
  776. foreach my $i (0 .. $#head) {
  777. push @{$indices{$bwt->[$i]}}, $i;
  778. }
  779. my @table;
  780. foreach my $v (@head) {
  781. push @table, shift(@{$indices{$v}});
  782. }
  783. my @dec;
  784. my $i = $idx;
  785. for (1 .. scalar(@head)) {
  786. push @dec, $head[$i];
  787. $i = $table[$i];
  788. }
  789. return \@dec;
  790. }
  791. #####################
  792. # RLE4 used in Bzip2
  793. #####################
  794. sub rle4_encode ($symbols, $max_run = 255) { # RLE1
  795. if (ref($symbols) eq '') {
  796. $symbols = string2symbols($symbols);
  797. }
  798. my $end = $#{$symbols};
  799. return [] if ($end < 0);
  800. my $prev = $symbols->[0];
  801. my $run = 1;
  802. my @rle = ($prev);
  803. for (my $i = 1 ; $i <= $end ; ++$i) {
  804. if ($symbols->[$i] == $prev) {
  805. ++$run;
  806. }
  807. else {
  808. $run = 1;
  809. $prev = $symbols->[$i];
  810. }
  811. push @rle, $prev;
  812. if ($run >= 4) {
  813. $run = 0;
  814. $i += 1;
  815. while ($run < $max_run and $i <= $end and $symbols->[$i] == $prev) {
  816. ++$run;
  817. ++$i;
  818. }
  819. push @rle, $run;
  820. $run = 1;
  821. if ($i <= $end) {
  822. $prev = $symbols->[$i];
  823. push @rle, $symbols->[$i];
  824. }
  825. }
  826. }
  827. return \@rle;
  828. }
  829. sub rle4_decode ($symbols) { # RLE1
  830. if (ref($symbols) eq '') {
  831. $symbols = string2symbols($symbols);
  832. }
  833. my $end = $#{$symbols};
  834. return [] if ($end < 0);
  835. my @dec = $symbols->[0];
  836. my $prev = $symbols->[0];
  837. my $run = 1;
  838. for (my $i = 1 ; $i <= $end ; ++$i) {
  839. if ($symbols->[$i] == $prev) {
  840. ++$run;
  841. }
  842. else {
  843. $run = 1;
  844. $prev = $symbols->[$i];
  845. }
  846. push @dec, $prev;
  847. if ($run >= 4) {
  848. if (++$i <= $end) {
  849. $run = $symbols->[$i];
  850. push @dec, (($prev) x $run);
  851. }
  852. $run = 0;
  853. }
  854. }
  855. return \@dec;
  856. }
  857. #######################
  858. # Delta encoding (+RLE)
  859. #######################
  860. sub _compute_elias_costs ($run_length) {
  861. # Check which method results in better compression
  862. my $with_rle = 0;
  863. my $without_rle = 0;
  864. my $double_with_rle = 0;
  865. my $double_without_rle = 0;
  866. # Check if there are any negative values or zero values
  867. my $has_negative = 0;
  868. my $has_zero = 0;
  869. foreach my $pair (@$run_length) {
  870. my ($c, $v) = @$pair;
  871. if ($c < 0 and not $has_negative) {
  872. $has_negative = 1;
  873. }
  874. if ($c == 0) {
  875. $with_rle += 1;
  876. $double_with_rle += 1;
  877. $without_rle += $v;
  878. $double_without_rle += $v;
  879. $has_zero ||= 1;
  880. }
  881. else {
  882. { # double
  883. my $t = int(log(abs($c) + 1) / log(2) + 1);
  884. my $l = int(log($t) / log(2) + 1);
  885. my $len = 2 * ($l - 1) + ($t - 1) + 3;
  886. $double_with_rle += $len;
  887. $double_without_rle += $len * $v;
  888. }
  889. { # single
  890. my $t = int(log(abs($c) + 1) / log(2) + 1);
  891. my $len = 2 * ($t - 1) + 3;
  892. $with_rle += $len;
  893. $without_rle += $len * $v;
  894. }
  895. }
  896. if ($v == 1) {
  897. $with_rle += 1;
  898. $double_with_rle += 1;
  899. }
  900. else {
  901. my $t = int(log($v) / log(2) + 1);
  902. my $len = 2 * ($t - 1) + 1;
  903. $with_rle += $len;
  904. $double_with_rle += $len;
  905. }
  906. }
  907. scalar {
  908. has_negative => $has_negative,
  909. has_zero => $has_zero,
  910. methods => {
  911. with_rle => $with_rle,
  912. without_rle => $without_rle,
  913. double_with_rle => $double_with_rle,
  914. double_without_rle => $double_without_rle,
  915. },
  916. };
  917. }
  918. sub _find_best_encoding_method ($integers) {
  919. my $rl = run_length($integers);
  920. my $costs = _compute_elias_costs($rl);
  921. my ($best_method) = sort { $costs->{methods}{$a} <=> $costs->{methods}{$b} } sort keys(%{$costs->{methods}});
  922. $VERBOSE && say STDERR "$best_method --> $costs->{methods}{$best_method}";
  923. return ($rl, $best_method, $costs);
  924. }
  925. sub delta_encode ($integers) {
  926. my $deltas = deltas($integers);
  927. my @methods = (
  928. [_find_best_encoding_method($integers), 0, 0],
  929. [_find_best_encoding_method($deltas), 1, 0],
  930. [_find_best_encoding_method(rle4_encode($integers, scalar(@$integers) + 1)), 0, 1],
  931. [_find_best_encoding_method(rle4_encode($deltas, scalar(@$integers) + 1)), 1, 1],
  932. );
  933. my ($best) = sort { $a->[2]{methods}{$a->[1]} <=> $b->[2]{methods}{$b->[1]} } @methods;
  934. my ($rl, $method, $stats, $with_deltas, $with_rle4) = @$best;
  935. my $double = 0;
  936. my $with_rle = 0;
  937. my $has_negative = $stats->{has_negative};
  938. if ($method eq 'with_rle') {
  939. $with_rle = 1;
  940. }
  941. elsif ($method eq 'without_rle') {
  942. ## ok
  943. }
  944. elsif ($method eq 'double_with_rle') {
  945. $with_rle = 1;
  946. $double = 1;
  947. }
  948. elsif ($method eq 'double_without_rle') {
  949. $double = 1;
  950. }
  951. else {
  952. confess "[BUG] Unknown encoding method: $method";
  953. }
  954. my $code = '';
  955. my $bitstring = join('', $double, $with_rle, $has_negative, $with_deltas, $with_rle4);
  956. my $length = sum(map { $_->[1] } @$rl) // 0;
  957. foreach my $pair ([$length, 1], @$rl) {
  958. my ($d, $v) = @$pair;
  959. if ($d == 0) {
  960. $code = '0';
  961. }
  962. elsif ($double) {
  963. my $t = sprintf('%b', abs($d) + 1);
  964. my $l = sprintf('%b', length($t));
  965. $code = ($has_negative ? ('1' . (($d < 0) ? '0' : '1')) : '') . ('1' x (length($l) - 1)) . '0' . substr($l, 1) . substr($t, 1);
  966. }
  967. else {
  968. my $t = sprintf('%b', abs($d) + ($has_negative ? 0 : 1));
  969. $code = ($has_negative ? ('1' . (($d < 0) ? '0' : '1')) : '') . ('1' x (length($t) - 1)) . '0' . substr($t, 1);
  970. }
  971. $bitstring .= $code;
  972. if (not $with_rle) {
  973. if ($v > 1) {
  974. $bitstring .= $code x ($v - 1);
  975. }
  976. next;
  977. }
  978. if ($v == 1) {
  979. $bitstring .= '0';
  980. }
  981. else {
  982. my $t = sprintf('%b', $v);
  983. $bitstring .= join('', '1' x (length($t) - 1), '0', substr($t, 1));
  984. }
  985. }
  986. pack('B*', $bitstring);
  987. }
  988. sub delta_decode ($fh) {
  989. if (ref($fh) eq '') {
  990. open(my $fh2, '<:raw', \$fh) or confess "error: $!";
  991. return __SUB__->($fh2);
  992. }
  993. my $buffer = '';
  994. my $double = read_bit($fh, \$buffer);
  995. my $with_rle = read_bit($fh, \$buffer);
  996. my $has_negative = read_bit($fh, \$buffer);
  997. my $with_deltas = read_bit($fh, \$buffer);
  998. my $with_rle4 = read_bit($fh, \$buffer);
  999. my @deltas;
  1000. my $len = 0;
  1001. for (my $k = 0 ; $k <= $len ; ++$k) {
  1002. my $bit = read_bit($fh, \$buffer);
  1003. if ($bit eq '0') {
  1004. push @deltas, 0;
  1005. }
  1006. elsif ($double) {
  1007. my $bit = $has_negative ? read_bit($fh, \$buffer) : 0;
  1008. my $bl = $has_negative ? 0 : 1;
  1009. ++$bl while (read_bit($fh, \$buffer) eq '1');
  1010. my $bl2 = oct('0b1' . join('', map { read_bit($fh, \$buffer) } 1 .. $bl));
  1011. my $int = oct('0b1' . join('', map { read_bit($fh, \$buffer) } 1 .. ($bl2 - 1)));
  1012. push @deltas, ($has_negative ? ($bit eq '1' ? 1 : -1) : 1) * ($int - 1);
  1013. }
  1014. else {
  1015. my $bit = $has_negative ? read_bit($fh, \$buffer) : 0;
  1016. my $n = $has_negative ? 0 : 1;
  1017. ++$n while (read_bit($fh, \$buffer) eq '1');
  1018. my $d = oct('0b1' . join('', map { read_bit($fh, \$buffer) } 1 .. $n));
  1019. push @deltas, $has_negative ? ($bit eq '1' ? $d : -$d) : ($d - 1);
  1020. }
  1021. if ($with_rle) {
  1022. my $bl = 0;
  1023. while (read_bit($fh, \$buffer) == 1) {
  1024. ++$bl;
  1025. }
  1026. if ($bl > 0) {
  1027. my $run = oct('0b1' . join('', map { read_bit($fh, \$buffer) } 1 .. $bl)) - 1;
  1028. $k += $run;
  1029. push @deltas, ($deltas[-1]) x $run;
  1030. }
  1031. }
  1032. if ($k == 0) {
  1033. $len = pop(@deltas);
  1034. }
  1035. }
  1036. my $decoded = \@deltas;
  1037. $decoded = rle4_decode($decoded) if $with_rle4;
  1038. $decoded = accumulate($decoded) if $with_deltas;
  1039. return $decoded;
  1040. }
  1041. ################################
  1042. # Alphabet encoding (from Bzip2)
  1043. ################################
  1044. sub encode_alphabet_256 ($alphabet) {
  1045. my %table;
  1046. @table{@$alphabet} = ();
  1047. my $populated = 0;
  1048. my @marked;
  1049. for (my $i = 0 ; $i <= 255 ; $i += 16) {
  1050. my $enc = 0;
  1051. foreach my $j (0 .. 15) {
  1052. if (exists($table{$i + $j})) {
  1053. $enc |= 1 << $j;
  1054. }
  1055. }
  1056. $populated <<= 1;
  1057. if ($enc > 0) {
  1058. $populated |= 1;
  1059. push @marked, $enc;
  1060. }
  1061. }
  1062. my $bitstring = join('', map { int2bits_lsb($_, 16) } @marked);
  1063. $VERBOSE && say STDERR "Populated : ", sprintf('%016b', $populated);
  1064. $VERBOSE && say STDERR "Marked : @marked";
  1065. $VERBOSE && say STDERR "Bits len : ", length($bitstring);
  1066. my $encoded = '';
  1067. $encoded .= int2bytes($populated, 2);
  1068. $encoded .= pack('B*', $bitstring);
  1069. return $encoded;
  1070. }
  1071. sub decode_alphabet_256 ($fh) {
  1072. if (ref($fh) eq '') {
  1073. open(my $fh2, '<:raw', \$fh) or confess "error: $!";
  1074. return __SUB__->($fh2);
  1075. }
  1076. my @alphabet;
  1077. my $l1 = bytes2int($fh, 2);
  1078. for my $i (0 .. 15) {
  1079. if ($l1 & (0x8000 >> $i)) {
  1080. my $l2 = bytes2int($fh, 2);
  1081. for my $j (0 .. 15) {
  1082. if ($l2 & (0x8000 >> $j)) {
  1083. push @alphabet, 16 * $i + $j;
  1084. }
  1085. }
  1086. }
  1087. }
  1088. return \@alphabet;
  1089. }
  1090. sub encode_alphabet ($alphabet) {
  1091. my $max_symbol = $alphabet->[-1] // -1;
  1092. if ($max_symbol <= 255) {
  1093. my $delta = delta_encode($alphabet);
  1094. my $enc = encode_alphabet_256($alphabet);
  1095. if (length($delta) < length($enc)) {
  1096. return (chr(0) . $delta);
  1097. }
  1098. return (chr(1) . $enc);
  1099. }
  1100. return (chr(0) . delta_encode($alphabet));
  1101. }
  1102. sub decode_alphabet ($fh) {
  1103. if (ref($fh) eq '') {
  1104. open(my $fh2, '<:raw', \$fh) or confess "error: $!";
  1105. return __SUB__->($fh2);
  1106. }
  1107. if (ord(getc($fh) // confess "error") == 1) {
  1108. return decode_alphabet_256($fh);
  1109. }
  1110. return delta_decode($fh);
  1111. }
  1112. ##########################
  1113. # Move to front transform
  1114. ##########################
  1115. sub mtf_encode ($symbols, $alphabet = undef) {
  1116. if (ref($symbols) eq '') {
  1117. $symbols = string2symbols($symbols);
  1118. }
  1119. if (defined($alphabet) and ref($alphabet) eq '') {
  1120. $alphabet = string2symbols($alphabet);
  1121. }
  1122. my (@C, @table);
  1123. my @alphabet;
  1124. my @alphabet_copy;
  1125. my $return_alphabet = 0;
  1126. if (defined($alphabet)) {
  1127. @alphabet = @$alphabet;
  1128. }
  1129. else {
  1130. @alphabet = sort { $a <=> $b } uniq(@$symbols);
  1131. $return_alphabet = 1;
  1132. @alphabet_copy = @alphabet;
  1133. }
  1134. my $index;
  1135. my @indices = (0 .. $#alphabet);
  1136. foreach my $c (@$symbols) {
  1137. foreach my $i (@indices) {
  1138. if ($alphabet[$i] == $c) {
  1139. $index = $i;
  1140. last;
  1141. }
  1142. }
  1143. push @C, $index;
  1144. unshift(@alphabet, splice(@alphabet, $index, 1));
  1145. }
  1146. $return_alphabet || return \@C;
  1147. return (\@C, \@alphabet_copy);
  1148. }
  1149. sub mtf_decode ($encoded, $alphabet) {
  1150. if (ref($encoded) eq '') {
  1151. $encoded = string2symbols($encoded);
  1152. }
  1153. if (ref($alphabet) eq '') {
  1154. $alphabet = string2symbols($alphabet);
  1155. }
  1156. my @S;
  1157. my @alpha = @$alphabet;
  1158. foreach my $p (@$encoded) {
  1159. push @S, $alpha[$p];
  1160. unshift(@alpha, splice(@alpha, $p, 1));
  1161. }
  1162. return \@S;
  1163. }
  1164. ###########################
  1165. # Zero Run-length encoding
  1166. ###########################
  1167. sub zrle_encode ($symbols) { # RLE2
  1168. if (ref($symbols) eq '') {
  1169. $symbols = string2symbols($symbols);
  1170. }
  1171. my @rle;
  1172. my $end = $#{$symbols};
  1173. for (my $i = 0 ; $i <= $end ; ++$i) {
  1174. my $run = 0;
  1175. while ($i <= $end and $symbols->[$i] == 0) {
  1176. ++$run;
  1177. ++$i;
  1178. }
  1179. if ($run >= 1) {
  1180. my $t = sprintf('%b', $run + 1);
  1181. push @rle, split(//, substr($t, 1));
  1182. }
  1183. if ($i <= $end) {
  1184. push @rle, $symbols->[$i] + 1;
  1185. }
  1186. }
  1187. return \@rle;
  1188. }
  1189. sub zrle_decode ($rle) { # RLE2
  1190. if (ref($rle) eq '') {
  1191. $rle = string2symbols($rle);
  1192. }
  1193. my @dec;
  1194. my $end = $#{$rle};
  1195. for (my $i = 0 ; $i <= $end ; ++$i) {
  1196. my $k = $rle->[$i];
  1197. if ($k == 0 or $k == 1) {
  1198. my $run = 1;
  1199. while (($i <= $end) and ($k == 0 or $k == 1)) {
  1200. ($run <<= 1) |= $k;
  1201. $k = $rle->[++$i];
  1202. }
  1203. push @dec, (0) x ($run - 1);
  1204. }
  1205. if ($i <= $end) {
  1206. push @dec, $k - 1;
  1207. }
  1208. }
  1209. return \@dec;
  1210. }
  1211. ################################################################
  1212. # Move-to-front compression (MTF + RLE4 + ZRLE + Huffman coding)
  1213. ################################################################
  1214. sub mrl_compress_symbolic ($symbols, $entropy_sub = \&create_huffman_entry) {
  1215. if (ref($symbols) eq '') {
  1216. $symbols = string2symbols($symbols);
  1217. }
  1218. my ($mtf, $alphabet) = mtf_encode($symbols);
  1219. my $rle = zrle_encode($mtf);
  1220. my $rle4 = rle4_encode($rle, scalar(@$rle));
  1221. encode_alphabet($alphabet) . $entropy_sub->($rle4);
  1222. }
  1223. *mrl_compress = \&mrl_compress_symbolic;
  1224. sub mrl_decompress_symbolic ($fh, $entropy_sub = \&decode_huffman_entry) {
  1225. if (ref($fh) eq '') {
  1226. open(my $fh2, '<:raw', \$fh) or confess "error: $!";
  1227. return __SUB__->($fh2, $entropy_sub);
  1228. }
  1229. my $alphabet = decode_alphabet($fh);
  1230. $VERBOSE && say STDERR "Alphabet size: ", scalar(@$alphabet);
  1231. my $rle4 = $entropy_sub->($fh);
  1232. my $rle = rle4_decode($rle4);
  1233. my $mtf = zrle_decode($rle);
  1234. my $symbols = mtf_decode($mtf, $alphabet);
  1235. return $symbols;
  1236. }
  1237. sub mrl_decompress($fh, $entropy_sub = \&decode_huffman_entry) {
  1238. symbols2string(mrl_decompress_symbolic($fh, $entropy_sub));
  1239. }
  1240. ############################################################
  1241. # BWT-based compression (BWT + MTF + ZRLE + Huffman coding)
  1242. ############################################################
  1243. sub bwt_compress ($chunk, $entropy_sub = \&create_huffman_entry) {
  1244. if (ref($chunk) ne '') {
  1245. return bwt_compress_symbolic($chunk, $entropy_sub);
  1246. }
  1247. my $rle1 = rle4_encode(string2symbols($chunk));
  1248. my ($bwt, $idx) = bwt_encode(pack('C*', @$rle1));
  1249. $VERBOSE && say STDERR "BWT index = $idx";
  1250. my ($mtf, $alphabet) = mtf_encode(string2symbols($bwt));
  1251. my $rle = zrle_encode($mtf);
  1252. pack('N', $idx) . encode_alphabet($alphabet) . $entropy_sub->($rle);
  1253. }
  1254. sub bwt_decompress ($fh, $entropy_sub = \&decode_huffman_entry) {
  1255. if (ref($fh) eq '') {
  1256. open(my $fh2, '<:raw', \$fh) or confess "error: $!";
  1257. return __SUB__->($fh2, $entropy_sub);
  1258. }
  1259. my $idx = bytes2int($fh, 4);
  1260. my $alphabet = decode_alphabet($fh);
  1261. $VERBOSE && say STDERR "BWT index = $idx";
  1262. $VERBOSE && say STDERR "Alphabet size: ", scalar(@$alphabet);
  1263. my $rle = $entropy_sub->($fh);
  1264. my $mtf = zrle_decode($rle);
  1265. my $bwt = mtf_decode($mtf, $alphabet);
  1266. my $rle4 = bwt_decode(pack('C*', @$bwt), $idx);
  1267. my $data = rle4_decode(string2symbols($rle4));
  1268. pack('C*', @$data);
  1269. }
  1270. ###########################################
  1271. # BWT-based compression (symbolic variant)
  1272. ###########################################
  1273. sub bwt_compress_symbolic ($symbols, $entropy_sub = \&create_huffman_entry) {
  1274. if (ref($symbols) eq '') {
  1275. $symbols = string2symbols($symbols);
  1276. }
  1277. my $rle4 = rle4_encode($symbols);
  1278. my ($bwt, $idx) = bwt_encode_symbolic($rle4);
  1279. my ($mtf, $alphabet) = mtf_encode($bwt);
  1280. my $rle = zrle_encode($mtf);
  1281. $VERBOSE && say STDERR "BWT index = $idx";
  1282. $VERBOSE && say STDERR "Max symbol: ", max(@$alphabet) // 0;
  1283. pack('N', $idx) . encode_alphabet($alphabet) . $entropy_sub->($rle);
  1284. }
  1285. sub bwt_decompress_symbolic ($fh, $entropy_sub = \&decode_huffman_entry) {
  1286. if (ref($fh) eq '') {
  1287. open(my $fh2, '<:raw', \$fh) or confess "error: $!";
  1288. return __SUB__->($fh2, $entropy_sub);
  1289. }
  1290. my $idx = bytes2int($fh, 4);
  1291. my $alphabet = decode_alphabet($fh);
  1292. $VERBOSE && say STDERR "BWT index = $idx";
  1293. $VERBOSE && say STDERR "Alphabet size: ", scalar(@$alphabet);
  1294. my $rle = $entropy_sub->($fh);
  1295. my $mtf = zrle_decode($rle);
  1296. my $bwt = mtf_decode($mtf, $alphabet);
  1297. my $rle4 = bwt_decode_symbolic($bwt, $idx);
  1298. my $data = rle4_decode($rle4);
  1299. return $data;
  1300. }
  1301. ###########################
  1302. # Arithmetic Coding entries
  1303. ###########################
  1304. sub create_ac_entry ($symbols) {
  1305. if (ref($symbols) eq '') {
  1306. $symbols = string2symbols($symbols);
  1307. }
  1308. my ($enc, $freq) = ac_encode($symbols);
  1309. my $max_symbol = max(keys %$freq) // 0;
  1310. my @freqs;
  1311. foreach my $k (0 .. $max_symbol) {
  1312. push @freqs, $freq->{$k} // 0;
  1313. }
  1314. push @freqs, length($enc) >> 3;
  1315. delta_encode(\@freqs) . pack("B*", $enc);
  1316. }
  1317. sub decode_ac_entry ($fh) {
  1318. if (ref($fh) eq '') {
  1319. open(my $fh2, '<:raw', \$fh) or confess "error: $!";
  1320. return __SUB__->($fh2);
  1321. }
  1322. my @freqs = @{delta_decode($fh)};
  1323. my $bits_len = pop(@freqs);
  1324. my %freq;
  1325. foreach my $i (0 .. $#freqs) {
  1326. if ($freqs[$i]) {
  1327. $freq{$i} = $freqs[$i];
  1328. }
  1329. }
  1330. $VERBOSE && say STDERR "Encoded length: $bits_len";
  1331. my $bits = read_bits($fh, $bits_len << 3);
  1332. if ($bits_len > 0) {
  1333. open my $bits_fh, '<:raw', \$bits;
  1334. return ac_decode($bits_fh, \%freq);
  1335. }
  1336. return [];
  1337. }
  1338. ####################################
  1339. # Adaptive Arithmetic Coding entries
  1340. ####################################
  1341. sub create_adaptive_ac_entry ($symbols) {
  1342. if (ref($symbols) eq '') {
  1343. $symbols = string2symbols($symbols);
  1344. }
  1345. my ($enc, $alphabet) = adaptive_ac_encode($symbols);
  1346. delta_encode([@$alphabet, length($enc) >> 3]) . pack('B*', $enc);
  1347. }
  1348. sub decode_adaptive_ac_entry ($fh) {
  1349. if (ref($fh) eq '') {
  1350. open(my $fh2, '<:raw', \$fh) or confess "error: $!";
  1351. return __SUB__->($fh2);
  1352. }
  1353. my $alphabet = delta_decode($fh);
  1354. my $enc_len = pop(@$alphabet);
  1355. if ($enc_len > 0) {
  1356. my $bits = read_bits($fh, $enc_len << 3);
  1357. open my $bits_fh, '<:raw', \$bits;
  1358. return adaptive_ac_decode($bits_fh, $alphabet);
  1359. }
  1360. return [];
  1361. }
  1362. ###########################
  1363. # Huffman Coding algorithm
  1364. ###########################
  1365. sub huffman_encode ($symbols, $dict) {
  1366. join('', @{$dict}{@$symbols});
  1367. }
  1368. sub huffman_decode ($bits, $rev_dict) {
  1369. local $" = '|';
  1370. [
  1371. split(
  1372. ' ', $bits =~ s{(@{[
  1373. map { $_->[1] }
  1374. sort { $a->[0] <=> $b->[0] }
  1375. map { [length($_), $_] }
  1376. keys %$rev_dict]
  1377. })}{$rev_dict->{$1} }gr
  1378. )
  1379. ];
  1380. }
  1381. # produce encode and decode dictionary from a tree
  1382. sub _huffman_walk_tree ($node, $code, $h) {
  1383. my $c = $node->[0] // return $h;
  1384. if (ref $c) { __SUB__->($c->[$_], $code . $_, $h) for ('0', '1') }
  1385. else { $h->{$c} = $code }
  1386. return $h;
  1387. }
  1388. sub huffman_from_code_lengths ($code_lengths) {
  1389. # This algorithm is based on the pseudocode in RFC 1951 (Section 3.2.2)
  1390. # (Steps are numbered as in the RFC)
  1391. # Step 1
  1392. my $max_length = max(@$code_lengths) // 0;
  1393. my @length_counts = (0) x ($max_length + 1);
  1394. foreach my $length (@$code_lengths) {
  1395. ++$length_counts[$length];
  1396. }
  1397. # Step 2
  1398. my $code = 0;
  1399. $length_counts[0] = 0;
  1400. my @next_code = (0) x ($max_length + 1);
  1401. foreach my $bits (1 .. $max_length) {
  1402. $code = ($code + $length_counts[$bits - 1]) << 1;
  1403. $next_code[$bits] = $code;
  1404. }
  1405. # Step 3
  1406. my @code_table;
  1407. foreach my $n (0 .. $#{$code_lengths}) {
  1408. my $length = $code_lengths->[$n];
  1409. if ($length != 0) {
  1410. $code_table[$n] = sprintf('%0*b', $length, $next_code[$length]);
  1411. ++$next_code[$length];
  1412. }
  1413. }
  1414. my %dict;
  1415. my %rev_dict;
  1416. foreach my $i (0 .. $#{$code_lengths}) {
  1417. my $code = $code_table[$i];
  1418. if (defined($code)) {
  1419. $dict{$i} = $code;
  1420. $rev_dict{$code} = $i;
  1421. }
  1422. }
  1423. return (wantarray ? (\%dict, \%rev_dict) : \%dict);
  1424. }
  1425. # make a tree, and return resulting dictionaries
  1426. sub huffman_from_freq ($freq) {
  1427. my @nodes = map { [$_, $freq->{$_}] } sort { $a <=> $b } keys %$freq;
  1428. my $max_symbol = scalar(@nodes) ? $nodes[-1][0] : -1;
  1429. do { # poor man's priority queue
  1430. @nodes = sort { $a->[1] <=> $b->[1] } @nodes;
  1431. my ($x, $y) = splice(@nodes, 0, 2);
  1432. if (defined($x)) {
  1433. if (defined($y)) {
  1434. push @nodes, [[$x, $y], $x->[1] + $y->[1]];
  1435. }
  1436. else {
  1437. push @nodes, [[$x], $x->[1]];
  1438. }
  1439. }
  1440. } while (@nodes > 1);
  1441. my $h = _huffman_walk_tree($nodes[0], '', {});
  1442. my @code_lengths;
  1443. foreach my $i (0 .. $max_symbol) {
  1444. if (exists $h->{$i}) {
  1445. $code_lengths[$i] = length($h->{$i});
  1446. }
  1447. else {
  1448. $code_lengths[$i] = 0;
  1449. }
  1450. }
  1451. huffman_from_code_lengths(\@code_lengths);
  1452. }
  1453. sub huffman_from_symbols ($symbols) {
  1454. if (ref($symbols) eq '') {
  1455. $symbols = string2symbols($symbols);
  1456. }
  1457. huffman_from_freq(frequencies($symbols));
  1458. }
  1459. ########################
  1460. # Huffman Coding entries
  1461. ########################
  1462. sub create_huffman_entry ($symbols) {
  1463. if (ref($symbols) eq '') {
  1464. $symbols = string2symbols($symbols);
  1465. }
  1466. my $dict = huffman_from_symbols($symbols);
  1467. my $enc = huffman_encode($symbols, $dict);
  1468. my $max_symbol = max(keys %$dict) // 0;
  1469. $VERBOSE && say STDERR "Max symbol: $max_symbol\n";
  1470. my @code_lengths;
  1471. foreach my $i (0 .. $max_symbol) {
  1472. if (exists($dict->{$i})) {
  1473. $code_lengths[$i] = length($dict->{$i});
  1474. }
  1475. else {
  1476. $code_lengths[$i] = 0;
  1477. }
  1478. }
  1479. delta_encode(\@code_lengths) . pack("N", length($enc)) . pack("B*", $enc);
  1480. }
  1481. sub decode_huffman_entry ($fh) {
  1482. if (ref($fh) eq '') {
  1483. open(my $fh2, '<:raw', \$fh) or confess "error: $!";
  1484. return __SUB__->($fh2);
  1485. }
  1486. my $code_lengths = delta_decode($fh);
  1487. my (undef, $rev_dict) = huffman_from_code_lengths($code_lengths);
  1488. my $enc_len = bytes2int($fh, 4);
  1489. $VERBOSE && say STDERR "Encoded length: $enc_len\n";
  1490. if ($enc_len > 0) {
  1491. return huffman_decode(read_bits($fh, $enc_len), $rev_dict);
  1492. }
  1493. return [];
  1494. }
  1495. ###################################################################################
  1496. # DEFLATE-like encoding of literals and backreferences produced by the LZSS methods
  1497. ###################################################################################
  1498. sub make_deflate_tables ($max_dist = $LZ_MAX_DIST, $max_len = $LZ_MAX_LEN) {
  1499. # [distance value, offset bits]
  1500. my @DISTANCE_SYMBOLS = map { [$_, 0] } (0 .. 4);
  1501. until ($DISTANCE_SYMBOLS[-1][0] > $max_dist) {
  1502. push @DISTANCE_SYMBOLS, [int($DISTANCE_SYMBOLS[-1][0] * (4 / 3)), $DISTANCE_SYMBOLS[-1][1] + 1];
  1503. push @DISTANCE_SYMBOLS, [int($DISTANCE_SYMBOLS[-1][0] * (3 / 2)), $DISTANCE_SYMBOLS[-1][1]];
  1504. }
  1505. # [length, offset bits]
  1506. my @LENGTH_SYMBOLS = ((map { [$_, 0] } (1 .. 10)));
  1507. {
  1508. my $delta = 1;
  1509. until ($LENGTH_SYMBOLS[-1][0] > $max_len) {
  1510. push @LENGTH_SYMBOLS, [$LENGTH_SYMBOLS[-1][0] + $delta, $LENGTH_SYMBOLS[-1][1] + 1];
  1511. $delta *= 2;
  1512. push @LENGTH_SYMBOLS, [$LENGTH_SYMBOLS[-1][0] + $delta, $LENGTH_SYMBOLS[-1][1]];
  1513. push @LENGTH_SYMBOLS, [$LENGTH_SYMBOLS[-1][0] + $delta, $LENGTH_SYMBOLS[-1][1]];
  1514. push @LENGTH_SYMBOLS, [$LENGTH_SYMBOLS[-1][0] + $delta, $LENGTH_SYMBOLS[-1][1]];
  1515. }
  1516. while (@LENGTH_SYMBOLS and $LENGTH_SYMBOLS[-1][0] >= $max_len) {
  1517. pop @LENGTH_SYMBOLS;
  1518. }
  1519. push @LENGTH_SYMBOLS, [$max_len, 0];
  1520. }
  1521. my @LENGTH_INDICES;
  1522. foreach my $i (0 .. $#LENGTH_SYMBOLS) {
  1523. my ($min, $bits) = @{$LENGTH_SYMBOLS[$i]};
  1524. foreach my $k ($min .. $min + (1 << $bits) - 1) {
  1525. $LENGTH_INDICES[$k] = $i;
  1526. }
  1527. }
  1528. return (\@DISTANCE_SYMBOLS, \@LENGTH_SYMBOLS, \@LENGTH_INDICES);
  1529. }
  1530. sub find_deflate_index ($value, $table) {
  1531. foreach my $i (0 .. $#{$table}) {
  1532. if ($table->[$i][0] > $value) {
  1533. return $i - 1;
  1534. }
  1535. }
  1536. confess "error";
  1537. }
  1538. sub deflate_encode ($literals, $distances, $lengths, $entropy_sub = \&create_huffman_entry) {
  1539. my $max_dist = max(@$distances) // 0;
  1540. my $max_len = max(@$lengths) // 0;
  1541. my $max_symbol = (max(grep { defined($_) } @$literals) // -1) + 1;
  1542. my ($DISTANCE_SYMBOLS, $LENGTH_SYMBOLS, $LENGTH_INDICES) = make_deflate_tables($max_dist, $max_len);
  1543. my @len_symbols;
  1544. my @dist_symbols;
  1545. my $offset_bits = '';
  1546. foreach my $k (0 .. $#$literals) {
  1547. if ($lengths->[$k] == 0) {
  1548. push @len_symbols, $literals->[$k];
  1549. next;
  1550. }
  1551. my $len = $lengths->[$k];
  1552. my $dist = $distances->[$k];
  1553. {
  1554. my $len_idx = $LENGTH_INDICES->[$len];
  1555. my ($min, $bits) = @{$LENGTH_SYMBOLS->[$len_idx]};
  1556. push @len_symbols, $len_idx + $max_symbol;
  1557. if ($bits > 0) {
  1558. $offset_bits .= sprintf('%0*b', $bits, $len - $min);
  1559. }
  1560. }
  1561. {
  1562. my $dist_idx = find_deflate_index($dist, $DISTANCE_SYMBOLS);
  1563. my ($min, $bits) = @{$DISTANCE_SYMBOLS->[$dist_idx]};
  1564. push @dist_symbols, $dist_idx;
  1565. if ($bits > 0) {
  1566. $offset_bits .= sprintf('%0*b', $bits, $dist - $min);
  1567. }
  1568. }
  1569. }
  1570. fibonacci_encode([$max_symbol, $max_dist, $max_len]) . $entropy_sub->(\@len_symbols) . $entropy_sub->(\@dist_symbols) . pack('B*', $offset_bits);
  1571. }
  1572. sub deflate_decode ($fh, $entropy_sub = \&decode_huffman_entry) {
  1573. if (ref($fh) eq '') {
  1574. open(my $fh2, '<:raw', \$fh) or confess "error: $!";
  1575. return __SUB__->($fh2, $entropy_sub);
  1576. }
  1577. my ($max_symbol, $max_dist, $max_len) = @{fibonacci_decode($fh)};
  1578. my ($DISTANCE_SYMBOLS, $LENGTH_SYMBOLS) = make_deflate_tables($max_dist, $max_len);
  1579. my $len_symbols = $entropy_sub->($fh);
  1580. my $dist_symbols = $entropy_sub->($fh);
  1581. my $bits_len = 0;
  1582. foreach my $i (@$dist_symbols) {
  1583. $bits_len += $DISTANCE_SYMBOLS->[$i][1];
  1584. }
  1585. foreach my $i (@$len_symbols) {
  1586. if ($i >= $max_symbol) {
  1587. $bits_len += $LENGTH_SYMBOLS->[$i - $max_symbol][1];
  1588. }
  1589. }
  1590. my $bits = read_bits($fh, $bits_len);
  1591. my @literals;
  1592. my @lengths;
  1593. my @distances;
  1594. my $j = 0;
  1595. foreach my $i (@$len_symbols) {
  1596. if ($i >= $max_symbol) {
  1597. my $dist = $dist_symbols->[$j++];
  1598. push @literals, undef;
  1599. push @lengths, $LENGTH_SYMBOLS->[$i - $max_symbol][0] + oct('0b' . substr($bits, 0, $LENGTH_SYMBOLS->[$i - $max_symbol][1], ''));
  1600. push @distances, $DISTANCE_SYMBOLS->[$dist][0] + oct('0b' . substr($bits, 0, $DISTANCE_SYMBOLS->[$dist][1], ''));
  1601. }
  1602. else {
  1603. push @literals, $i;
  1604. push @lengths, 0;
  1605. push @distances, 0;
  1606. }
  1607. }
  1608. return (\@literals, \@distances, \@lengths);
  1609. }
  1610. #####################
  1611. # Elias gamma coding
  1612. #####################
  1613. sub elias_gamma_encode ($integers) {
  1614. my $bitstring = '';
  1615. foreach my $k (scalar(@$integers), @$integers) {
  1616. my $t = sprintf('%b', $k + 1);
  1617. $bitstring .= ('1' x (length($t) - 1)) . '0' . substr($t, 1);
  1618. }
  1619. pack('B*', $bitstring);
  1620. }
  1621. sub elias_gamma_decode ($fh) {
  1622. if (ref($fh) eq '') {
  1623. open(my $fh2, '<:raw', \$fh) or confess "error: $!";
  1624. return __SUB__->($fh2);
  1625. }
  1626. my @ints;
  1627. my $len = 0;
  1628. my $buffer = '';
  1629. for (my $k = 0 ; $k <= $len ; ++$k) {
  1630. my $n = 0;
  1631. ++$n while (read_bit($fh, \$buffer) eq '1');
  1632. push @ints, oct('0b1' . join('', map { read_bit($fh, \$buffer) } 1 .. $n)) - 1;
  1633. if ($k == 0) {
  1634. $len = pop(@ints);
  1635. }
  1636. }
  1637. return \@ints;
  1638. }
  1639. #####################
  1640. # Elias omega coding
  1641. #####################
  1642. sub elias_omega_encode ($integers) {
  1643. my $bitstring = '';
  1644. foreach my $k (scalar(@$integers), @$integers) {
  1645. if ($k == 0) {
  1646. $bitstring .= '0';
  1647. }
  1648. else {
  1649. my $t = sprintf('%b', $k + 1);
  1650. my $l = length($t);
  1651. my $L = sprintf('%b', $l);
  1652. $bitstring .= ('1' x (length($L) - 1)) . '0' . substr($L, 1) . substr($t, 1);
  1653. }
  1654. }
  1655. pack('B*', $bitstring);
  1656. }
  1657. sub elias_omega_decode ($fh) {
  1658. if (ref($fh) eq '') {
  1659. open(my $fh2, '<:raw', \$fh) or confess "error: $!";
  1660. return __SUB__->($fh2);
  1661. }
  1662. my @ints;
  1663. my $len = 0;
  1664. my $buffer = '';
  1665. for (my $k = 0 ; $k <= $len ; ++$k) {
  1666. my $bl = 0;
  1667. ++$bl while (read_bit($fh, \$buffer) eq '1');
  1668. if ($bl > 0) {
  1669. my $bl2 = oct('0b1' . join('', map { read_bit($fh, \$buffer) } 1 .. $bl));
  1670. my $int = oct('0b1' . join('', map { read_bit($fh, \$buffer) } 1 .. ($bl2 - 1))) - 1;
  1671. push @ints, $int;
  1672. }
  1673. else {
  1674. push @ints, 0;
  1675. }
  1676. if ($k == 0) {
  1677. $len = pop(@ints);
  1678. }
  1679. }
  1680. return \@ints;
  1681. }
  1682. ###################
  1683. # LZSS SYMBOLIC
  1684. ###################
  1685. sub lzss_encode_symbolic($symbols, %params) {
  1686. if (ref($symbols) eq '') {
  1687. return lzss_encode($symbols, %params);
  1688. }
  1689. my $min_len = $params{min_len} // $LZ_MIN_LEN;
  1690. my $max_len = $params{max_len} // $LZ_MAX_LEN;
  1691. my $max_dist = $params{max_dist} // $LZ_MAX_DIST;
  1692. my $max_chain_len = $params{max_chain_len} // $LZ_MAX_CHAIN_LEN;
  1693. my $end = $#$symbols;
  1694. my (@literals, @distances, @lengths, %table);
  1695. for (my $la = 0 ; $la <= $end ;) {
  1696. my $best_n = 1;
  1697. my $best_p = $la;
  1698. my $upto = $la + $min_len - 1;
  1699. my $lookahead = join(' ', @{$symbols}[$la .. ($upto > $end ? $end : $upto)]);
  1700. if (exists $table{$lookahead}) {
  1701. foreach my $p (@{$table{$lookahead}}) {
  1702. last if ($la - $p > $max_dist);
  1703. my $n = $min_len;
  1704. ++$n while ($la + $n <= $end and $symbols->[$la + $n - 1] == $symbols->[$p + $n - 1] and $n <= $max_len);
  1705. if ($n > $best_n) {
  1706. $best_n = $n;
  1707. $best_p = $p;
  1708. last if ($n > $max_len);
  1709. }
  1710. }
  1711. }
  1712. if ($best_n == 1) {
  1713. $table{$lookahead} = [$la];
  1714. }
  1715. else {
  1716. my @matched = @{$symbols}[$la .. $la + $best_n - 1];
  1717. my @key_arr = @matched[0 .. $min_len - 1];
  1718. foreach my $i (0 .. scalar(@matched) - $min_len) {
  1719. my $key = join(' ', @key_arr);
  1720. unshift @{$table{$key}}, $la + $i;
  1721. pop @{$table{$key}} if (@{$table{$key}} > $max_chain_len);
  1722. shift(@key_arr);
  1723. push @key_arr, $matched[$i + $min_len];
  1724. }
  1725. }
  1726. if ($best_n > $min_len) {
  1727. push @lengths, $best_n - 1;
  1728. push @distances, $la - $best_p;
  1729. push @literals, undef;
  1730. $la += $best_n - 1;
  1731. }
  1732. elsif ($best_n == 1) {
  1733. push @lengths, 0;
  1734. push @distances, 0;
  1735. push @literals, $symbols->[$la++];
  1736. }
  1737. else {
  1738. push @lengths, (0) x $best_n;
  1739. push @distances, (0) x $best_n;
  1740. push @literals, @{$symbols}[$la .. $la + $best_n - 1];
  1741. $la += $best_n;
  1742. }
  1743. }
  1744. return (\@literals, \@distances, \@lengths);
  1745. }
  1746. sub lzss_decode_symbolic ($literals, $distances, $lengths) {
  1747. my @data;
  1748. my $data_len = 0;
  1749. foreach my $i (0 .. $#$lengths) {
  1750. if ($lengths->[$i] == 0) {
  1751. push @data, $literals->[$i];
  1752. $data_len += 1;
  1753. next;
  1754. }
  1755. my $length = $lengths->[$i] // confess "bad input";
  1756. my $dist = $distances->[$i] // confess "bad input";
  1757. if ($dist >= $length) { # non-overlapping matches
  1758. push @data, @data[$data_len - $dist .. $data_len - $dist + $length - 1];
  1759. }
  1760. elsif ($dist == 1) { # run-length of last character
  1761. push @data, ($data[-1]) x $length;
  1762. }
  1763. else { # overlapping matches
  1764. foreach my $j (1 .. $length) {
  1765. push @data, $data[$data_len + $j - $dist - 1];
  1766. }
  1767. }
  1768. $data_len += $length;
  1769. }
  1770. return \@data;
  1771. }
  1772. ###################
  1773. # LZSS Encoding
  1774. ###################
  1775. sub lzss_encode ($str, %params) {
  1776. if (ref($str) ne '') {
  1777. return lzss_encode_symbolic($str, %params);
  1778. }
  1779. my $min_len = $params{min_len} // $LZ_MIN_LEN;
  1780. my $max_len = $params{max_len} // $LZ_MAX_LEN;
  1781. my $max_dist = $params{max_dist} // $LZ_MAX_DIST;
  1782. my $max_chain_len = $params{max_chain_len} // $LZ_MAX_CHAIN_LEN;
  1783. my @symbols = unpack('C*', $str);
  1784. my $end = $#symbols;
  1785. my (@literals, @distances, @lengths, %table);
  1786. for (my $la = 0 ; $la <= $end ;) {
  1787. my $best_n = 1;
  1788. my $best_p = $la;
  1789. my $lookahead = substr($str, $la, $min_len);
  1790. if (exists $table{$lookahead}) {
  1791. foreach my $p (@{$table{$lookahead}}) {
  1792. last if ($la - $p > $max_dist);
  1793. my $n = $min_len;
  1794. ++$n while ($la + $n <= $end and $symbols[$la + $n - 1] == $symbols[$p + $n - 1] and $n <= $max_len);
  1795. if ($n > $best_n) {
  1796. $best_p = $p;
  1797. $best_n = $n;
  1798. last if ($best_n > $max_len);
  1799. }
  1800. }
  1801. }
  1802. if ($best_n == 1) {
  1803. $table{$lookahead} = [$la];
  1804. }
  1805. else {
  1806. my $matched = substr($str, $la, $best_n);
  1807. foreach my $i (0 .. $best_n - $min_len) {
  1808. my $key = substr($matched, $i, $min_len);
  1809. unshift @{$table{$key}}, $la + $i;
  1810. pop(@{$table{$key}}) if (@{$table{$key}} > $max_chain_len);
  1811. }
  1812. }
  1813. if ($best_n == 1) {
  1814. $table{$lookahead} = [$la];
  1815. }
  1816. if ($best_n > $min_len) {
  1817. push @lengths, $best_n - 1;
  1818. push @distances, $la - $best_p;
  1819. push @literals, undef;
  1820. $la += $best_n - 1;
  1821. }
  1822. elsif ($best_n == 1) {
  1823. push @lengths, 0;
  1824. push @distances, 0;
  1825. push @literals, $symbols[$la++];
  1826. }
  1827. else {
  1828. push @lengths, (0) x $best_n;
  1829. push @distances, (0) x $best_n;
  1830. push @literals, @symbols[$la .. $la + $best_n - 1];
  1831. $la += $best_n;
  1832. }
  1833. }
  1834. return (\@literals, \@distances, \@lengths);
  1835. }
  1836. sub lzss_decode ($literals, $distances, $lengths) {
  1837. my $data = '';
  1838. my $data_len = 0;
  1839. foreach my $i (0 .. $#$lengths) {
  1840. if ($lengths->[$i] == 0) {
  1841. $data .= chr($literals->[$i]);
  1842. ++$data_len;
  1843. next;
  1844. }
  1845. my $length = $lengths->[$i] // confess "bad input";
  1846. my $dist = $distances->[$i] // confess "bad input";
  1847. if ($dist >= $length) { # non-overlapping matches
  1848. $data .= substr($data, $data_len - $dist, $length) // confess "bad input";
  1849. }
  1850. elsif ($dist == 1) { # run-length of last character
  1851. $data .= substr($data, -1) x $length;
  1852. }
  1853. else { # overlapping matches
  1854. foreach my $i (1 .. $length) {
  1855. $data .= substr($data, $data_len + $i - $dist - 1, 1) // confess "bad input";
  1856. }
  1857. }
  1858. $data_len += $length;
  1859. }
  1860. return $data;
  1861. }
  1862. ###################
  1863. # LZSSF Compression
  1864. ###################
  1865. sub lzss_encode_fast_symbolic ($symbols, %params) {
  1866. if (ref($symbols) eq '') {
  1867. return lzss_encode_fast($symbols, %params);
  1868. }
  1869. my $la = 0;
  1870. my $end = $#$symbols;
  1871. my $min_len = $params{min_len} // $LZ_MIN_LEN; # minimum match length
  1872. my $max_len = $params{max_len} // $LZ_MAX_LEN; # maximum match length
  1873. my $max_dist = $params{max_dist} // $LZ_MAX_DIST; # maximum offset distance
  1874. my (@literals, @distances, @lengths, %table);
  1875. while ($la <= $end) {
  1876. my $best_n = 1;
  1877. my $best_p = $la;
  1878. my $upto = $la + $min_len - 1;
  1879. my $lookahead = join(' ', @{$symbols}[$la .. ($upto > $end ? $end : $upto)]);
  1880. if (exists($table{$lookahead}) and $la - $table{$lookahead} <= $max_dist) {
  1881. my $p = $table{$lookahead};
  1882. my $n = $min_len;
  1883. ++$n while ($la + $n <= $end and $symbols->[$la + $n - 1] == $symbols->[$p + $n - 1] and $n <= $max_len);
  1884. $best_p = $p;
  1885. $best_n = $n;
  1886. }
  1887. $table{$lookahead} = $la;
  1888. if ($best_n > $min_len) {
  1889. push @lengths, $best_n - 1;
  1890. push @distances, $la - $best_p;
  1891. push @literals, undef;
  1892. $la += $best_n - 1;
  1893. }
  1894. elsif ($best_n == 1) {
  1895. push @lengths, 0;
  1896. push @distances, 0;
  1897. push @literals, $symbols->[$la++];
  1898. }
  1899. else {
  1900. push @lengths, (0) x $best_n;
  1901. push @distances, (0) x $best_n;
  1902. push @literals, @{$symbols}[$la .. $la + $best_n - 1];
  1903. $la += $best_n;
  1904. }
  1905. }
  1906. return (\@literals, \@distances, \@lengths);
  1907. }
  1908. sub lzss_encode_fast($str, %params) {
  1909. if (ref($str) ne '') {
  1910. return lzss_encode_fast_symbolic($str, %params);
  1911. }
  1912. my @symbols = unpack('C*', $str);
  1913. my $la = 0;
  1914. my $end = $#symbols;
  1915. my $min_len = $params{min_len} // $LZ_MIN_LEN; # minimum match length
  1916. my $max_len = $params{max_len} // $LZ_MAX_LEN; # maximum match length
  1917. my $max_dist = $params{max_dist} // $LZ_MAX_DIST; # maximum offset distance
  1918. my (@literals, @distances, @lengths, %table);
  1919. while ($la <= $end) {
  1920. my $best_n = 1;
  1921. my $best_p = $la;
  1922. my $lookahead = substr($str, $la, $min_len);
  1923. if (exists($table{$lookahead}) and $la - $table{$lookahead} <= $max_dist) {
  1924. my $p = $table{$lookahead};
  1925. my $n = $min_len;
  1926. ++$n while ($la + $n <= $end and $symbols[$la + $n - 1] == $symbols[$p + $n - 1] and $n <= $max_len);
  1927. $best_p = $p;
  1928. $best_n = $n;
  1929. }
  1930. $table{$lookahead} = $la;
  1931. if ($best_n > $min_len) {
  1932. push @lengths, $best_n - 1;
  1933. push @distances, $la - $best_p;
  1934. push @literals, undef;
  1935. $la += $best_n - 1;
  1936. }
  1937. elsif ($best_n == 1) {
  1938. push @lengths, 0;
  1939. push @distances, 0;
  1940. push @literals, $symbols[$la++];
  1941. }
  1942. else {
  1943. push @lengths, (0) x $best_n;
  1944. push @distances, (0) x $best_n;
  1945. push @literals, @symbols[$la .. $la + $best_n - 1];
  1946. $la += $best_n;
  1947. }
  1948. }
  1949. return (\@literals, \@distances, \@lengths);
  1950. }
  1951. ################################
  1952. # LZ77 encoding, inspired by LZ4
  1953. ################################
  1954. sub lz77_encode($chunk, $lzss_encoding_sub = \&lzss_encode) {
  1955. local $LZ_MAX_LEN = ~0; # maximum match length
  1956. my ($literals, $distances, $lengths) = $lzss_encoding_sub->($chunk);
  1957. my $literals_end = $#{$literals};
  1958. my (@symbols, @len_symbols, @match_symbols, @dist_symbols);
  1959. for (my $i = 0 ; $i <= $literals_end ; ++$i) {
  1960. my $j = $i;
  1961. while ($i <= $literals_end and defined($literals->[$i])) {
  1962. ++$i;
  1963. }
  1964. my $literals_length = $i - $j;
  1965. my $match_len = $lengths->[$i] // 0;
  1966. push @match_symbols, (($literals_length >= 7 ? 7 : $literals_length) << 5) | ($match_len >= 31 ? 31 : $match_len);
  1967. $literals_length -= 7;
  1968. $match_len -= 31;
  1969. while ($literals_length >= 0) {
  1970. push @len_symbols, ($literals_length >= 255 ? 255 : $literals_length);
  1971. $literals_length -= 255;
  1972. }
  1973. if ($i > $j) {
  1974. push @symbols, @{$literals}[$j .. $i - 1];
  1975. }
  1976. while ($match_len >= 0) {
  1977. push @match_symbols, ($match_len >= 255 ? 255 : $match_len);
  1978. $match_len -= 255;
  1979. }
  1980. push @dist_symbols, $distances->[$i] // 0;
  1981. }
  1982. return (\@symbols, \@dist_symbols, \@len_symbols, \@match_symbols);
  1983. }
  1984. *lz77_encode_symbolic = \&lz77_encode;
  1985. sub lz77_decode($symbols, $dist_symbols, $len_symbols, $match_symbols) {
  1986. my $data = '';
  1987. my $data_len = 0;
  1988. my @symbols = @$symbols;
  1989. my @len_symbols = @$len_symbols;
  1990. my @match_symbols = @$match_symbols;
  1991. my @dist_symbols = @$dist_symbols;
  1992. while (@symbols) {
  1993. my $len_byte = shift(@match_symbols) // confess "bad input";
  1994. my $literals_length = $len_byte >> 5;
  1995. my $match_len = $len_byte & 0b11111;
  1996. if ($literals_length == 7) {
  1997. while (1) {
  1998. my $byte_len = shift(@len_symbols) // confess "bad input";
  1999. $literals_length += $byte_len;
  2000. last if $byte_len != 255;
  2001. }
  2002. }
  2003. if ($literals_length > 0) {
  2004. $data .= pack("C*", splice(@symbols, 0, $literals_length));
  2005. $data_len += $literals_length;
  2006. }
  2007. if ($match_len == 31) {
  2008. while (1) {
  2009. my $byte_len = shift(@match_symbols) // confess "bad input";
  2010. $match_len += $byte_len;
  2011. last if $byte_len != 255;
  2012. }
  2013. }
  2014. my $dist = shift(@dist_symbols) // confess "bad input";
  2015. if ($dist >= $match_len) { # non-overlapping matches
  2016. $data .= substr($data, $data_len - $dist, $match_len) // confess "bad input";
  2017. }
  2018. elsif ($dist == 1) { # run-length of last character
  2019. $data .= substr($data, -1) x $match_len;
  2020. }
  2021. else { # overlapping matches
  2022. foreach my $i (1 .. $match_len) {
  2023. $data .= substr($data, $data_len + $i - $dist - 1, 1) // confess "bad input";
  2024. }
  2025. }
  2026. $data_len += $match_len;
  2027. }
  2028. return $data;
  2029. }
  2030. sub lz77_decode_symbolic($symbols, $dist_symbols, $len_symbols, $match_symbols) {
  2031. my @data;
  2032. my $data_len = 0;
  2033. my @symbols = @$symbols;
  2034. my @len_symbols = @$len_symbols;
  2035. my @match_symbols = @$match_symbols;
  2036. my @dist_symbols = @$dist_symbols;
  2037. while (@symbols) {
  2038. my $len_byte = shift(@match_symbols) // confess "bad input";
  2039. my $literals_length = $len_byte >> 5;
  2040. my $match_len = $len_byte & 0b11111;
  2041. if ($literals_length == 7) {
  2042. while (1) {
  2043. my $byte_len = shift(@len_symbols) // confess "bad input";
  2044. $literals_length += $byte_len;
  2045. last if $byte_len != 255;
  2046. }
  2047. }
  2048. if ($literals_length > 0) {
  2049. push @data, splice(@symbols, 0, $literals_length);
  2050. $data_len += $literals_length;
  2051. }
  2052. if ($match_len == 31) {
  2053. while (1) {
  2054. my $byte_len = shift(@match_symbols) // confess "bad input";
  2055. $match_len += $byte_len;
  2056. last if $byte_len != 255;
  2057. }
  2058. }
  2059. my $dist = shift(@dist_symbols) // confess "bad input";
  2060. if ($dist >= $match_len) { # non-overlapping matches
  2061. push @data, @data[scalar(@data) - $dist .. scalar(@data) - $dist + $match_len - 1];
  2062. }
  2063. elsif ($dist == 1) { # run-length of last character
  2064. push @data, ($data[-1]) x $match_len;
  2065. }
  2066. else { # overlapping matches
  2067. foreach my $j (1 .. $match_len) {
  2068. push @data, $data[$data_len + $j - $dist - 1];
  2069. }
  2070. }
  2071. $data_len += $match_len;
  2072. }
  2073. return \@data;
  2074. }
  2075. sub lz77_compress($chunk, $entropy_sub = \&create_huffman_entry, $lzss_encoding_sub = \&lzss_encode) {
  2076. my ($symbols, $dist_symbols, $len_symbols, $match_symbols) = lz77_encode($chunk, $lzss_encoding_sub);
  2077. $entropy_sub->($symbols) . $entropy_sub->($len_symbols) . $entropy_sub->($match_symbols) . obh_encode($dist_symbols, $entropy_sub);
  2078. }
  2079. *lz77_compress_symbolic = \&lz77_compress;
  2080. sub lz77_decompress($fh, $entropy_sub = \&decode_huffman_entry) {
  2081. if (ref($fh) eq '') {
  2082. open(my $fh2, '<:raw', \$fh) or confess "error: $!";
  2083. return __SUB__->($fh2, $entropy_sub);
  2084. }
  2085. my $symbols = $entropy_sub->($fh);
  2086. my $len_symbols = $entropy_sub->($fh);
  2087. my $match_symbols = $entropy_sub->($fh);
  2088. my $dist_symbols = obh_decode($fh, $entropy_sub);
  2089. lz77_decode($symbols, $dist_symbols, $len_symbols, $match_symbols);
  2090. }
  2091. sub lz77_decompress_symbolic($fh, $entropy_sub = \&decode_huffman_entry) {
  2092. if (ref($fh) eq '') {
  2093. open(my $fh2, '<:raw', \$fh) or confess "error: $!";
  2094. return __SUB__->($fh2, $entropy_sub);
  2095. }
  2096. my $symbols = $entropy_sub->($fh);
  2097. my $len_symbols = $entropy_sub->($fh);
  2098. my $match_symbols = $entropy_sub->($fh);
  2099. my $dist_symbols = obh_decode($fh, $entropy_sub);
  2100. lz77_decode_symbolic($symbols, $dist_symbols, $len_symbols, $match_symbols);
  2101. }
  2102. #########################
  2103. # LZSS + DEFLATE encoding
  2104. #########################
  2105. sub lzss_compress($chunk, $entropy_sub = \&create_huffman_entry, $lzss_encoding_sub = \&lzss_encode) {
  2106. my ($literals, $distances, $lengths) = $lzss_encoding_sub->($chunk);
  2107. deflate_encode($literals, $distances, $lengths, $entropy_sub);
  2108. }
  2109. *lzss_compress_symbolic = \&lzss_compress;
  2110. sub lzss_decompress($fh, $entropy_sub = \&decode_huffman_entry) {
  2111. if (ref($fh) eq '') {
  2112. open(my $fh2, '<:raw', \$fh) or confess "error: $!";
  2113. return __SUB__->($fh2, $entropy_sub);
  2114. }
  2115. my ($literals, $distances, $lengths) = deflate_decode($fh, $entropy_sub);
  2116. lzss_decode($literals, $distances, $lengths);
  2117. }
  2118. sub lzss_decompress_symbolic($fh, $entropy_sub = \&decode_huffman_entry) {
  2119. if (ref($fh) eq '') {
  2120. open(my $fh2, '<:raw', \$fh) or confess "error: $!";
  2121. return __SUB__->($fh2, $entropy_sub);
  2122. }
  2123. my ($literals, $distances, $lengths) = deflate_decode($fh, $entropy_sub);
  2124. lzss_decode_symbolic($literals, $distances, $lengths);
  2125. }
  2126. #########################################
  2127. # LZB -- LZSS with byte-oriented encoding
  2128. #########################################
  2129. sub lzb_compress ($chunk, $lzss_encoding_sub = \&lzss_encode) {
  2130. my ($literals, $distances, $lengths) = do {
  2131. local $LZ_MAX_DIST = (1 << 16) - 1;
  2132. local $LZ_MAX_LEN = ~0;
  2133. $lzss_encoding_sub->($chunk);
  2134. };
  2135. my $literals_end = $#{$literals};
  2136. my $data = '';
  2137. for (my $i = 0 ; $i <= $literals_end ; ++$i) {
  2138. my $j = $i;
  2139. while ($i <= $literals_end and defined($literals->[$i])) {
  2140. ++$i;
  2141. }
  2142. my $literals_length = $i - $j;
  2143. my $match_len = $lengths->[$i] // 0;
  2144. $data .= chr((($literals_length >= 7 ? 7 : $literals_length) << 5) | ($match_len >= 31 ? 31 : $match_len));
  2145. $literals_length -= 7;
  2146. $match_len -= 31;
  2147. while ($literals_length >= 0) {
  2148. $data .= $literals_length >= 255 ? "\xff" : chr($literals_length);
  2149. $literals_length -= 255;
  2150. }
  2151. if ($i > $j) {
  2152. $data .= pack('C*', @{$literals}[$j .. $i - 1]);
  2153. }
  2154. while ($match_len >= 0) {
  2155. $data .= $match_len >= 255 ? "\xff" : chr($match_len);
  2156. $match_len -= 255;
  2157. }
  2158. $data .= pack('B*', sprintf('%016b', $distances->[$i] // 0));
  2159. }
  2160. return fibonacci_encode([length $data]) . $data;
  2161. }
  2162. sub lzb_decompress($fh) {
  2163. if (ref($fh) eq '') {
  2164. open(my $fh2, '<:raw', \$fh) or confess "error: $!";
  2165. return __SUB__->($fh2);
  2166. }
  2167. my $data = '';
  2168. my $search_window = '';
  2169. my $search_window_size = 1 << 16;
  2170. my $block_size = fibonacci_decode($fh)->[0] // confess "decompression error";
  2171. read($fh, (my $block), $block_size) // confess "Read error: $!";
  2172. while ($block ne '') {
  2173. my $len_byte = ord substr($block, 0, 1, '');
  2174. my $literals_length = $len_byte >> 5;
  2175. my $match_len = $len_byte & 0b11111;
  2176. if ($literals_length == 7) {
  2177. while (1) {
  2178. my $byte_len = ord substr($block, 0, 1, '');
  2179. $literals_length += $byte_len;
  2180. last if $byte_len != 255;
  2181. }
  2182. }
  2183. if ($literals_length > 0) {
  2184. $search_window .= substr($block, 0, $literals_length, '');
  2185. }
  2186. if ($match_len == 31) {
  2187. while (1) {
  2188. my $byte_len = ord substr($block, 0, 1, '');
  2189. $match_len += $byte_len;
  2190. last if $byte_len != 255;
  2191. }
  2192. }
  2193. my $offset = oct('0b' . unpack('B*', substr($block, 0, 2, '')));
  2194. if ($offset >= $match_len) { # non-overlapping matches
  2195. $search_window .= substr($search_window, length($search_window) - $offset, $match_len);
  2196. }
  2197. elsif ($offset == 1) { # run-length of last character
  2198. $search_window .= substr($search_window, -1) x $match_len;
  2199. }
  2200. else { # overlapping matches
  2201. foreach my $i (1 .. $match_len) {
  2202. $search_window .= substr($search_window, length($search_window) - $offset, 1);
  2203. }
  2204. }
  2205. $data .= substr($search_window, -($match_len + $literals_length));
  2206. $search_window = substr($search_window, -$search_window_size) if (length($search_window) > 2 * $search_window_size);
  2207. }
  2208. return $data;
  2209. }
  2210. ################################################################
  2211. # Encode a list of symbols, using offset bits and huffman coding
  2212. ################################################################
  2213. sub obh_encode ($distances, $entropy_sub = \&create_huffman_entry) {
  2214. my $max_dist = max(@$distances) // 0;
  2215. my ($DISTANCE_SYMBOLS) = make_deflate_tables($max_dist, 0);
  2216. my @symbols;
  2217. my $offset_bits = '';
  2218. foreach my $dist (@$distances) {
  2219. my $i = find_deflate_index($dist, $DISTANCE_SYMBOLS);
  2220. my ($min, $bits) = @{$DISTANCE_SYMBOLS->[$i]};
  2221. push @symbols, $i;
  2222. if ($bits > 0) {
  2223. $offset_bits .= sprintf('%0*b', $bits, $dist - $min);
  2224. }
  2225. }
  2226. fibonacci_encode([$max_dist]) . $entropy_sub->(\@symbols) . pack('B*', $offset_bits);
  2227. }
  2228. sub obh_decode ($fh, $entropy_sub = \&decode_huffman_entry) {
  2229. if (ref($fh) eq '') {
  2230. open(my $fh2, '<:raw', \$fh) or confess "error: $!";
  2231. return __SUB__->($fh2, $entropy_sub);
  2232. }
  2233. my $max_dist = fibonacci_decode($fh)->[0];
  2234. my ($DISTANCE_SYMBOLS) = make_deflate_tables($max_dist, 0);
  2235. my $symbols = $entropy_sub->($fh);
  2236. my $bits_len = 0;
  2237. foreach my $i (@$symbols) {
  2238. $bits_len += $DISTANCE_SYMBOLS->[$i][1];
  2239. }
  2240. my $bits = read_bits($fh, $bits_len);
  2241. my @distances;
  2242. foreach my $i (@$symbols) {
  2243. push @distances, $DISTANCE_SYMBOLS->[$i][0] + oct('0b' . substr($bits, 0, $DISTANCE_SYMBOLS->[$i][1], ''));
  2244. }
  2245. return \@distances;
  2246. }
  2247. #################
  2248. # LZW Compression
  2249. #################
  2250. sub lzw_encode ($uncompressed) {
  2251. # Build the dictionary
  2252. my $dict_size = 256;
  2253. my %dictionary;
  2254. foreach my $i (0 .. $dict_size - 1) {
  2255. $dictionary{chr($i)} = $i;
  2256. }
  2257. my $w = '';
  2258. my @result;
  2259. foreach my $c (split(//, $uncompressed)) {
  2260. my $wc = $w . $c;
  2261. if (exists $dictionary{$wc}) {
  2262. $w = $wc;
  2263. }
  2264. else {
  2265. push @result, $dictionary{$w};
  2266. # Add wc to the dictionary
  2267. $dictionary{$wc} = $dict_size++;
  2268. $w = $c;
  2269. }
  2270. }
  2271. # Output the code for w
  2272. if ($w ne '') {
  2273. push @result, $dictionary{$w};
  2274. }
  2275. return \@result;
  2276. }
  2277. sub lzw_decode ($compressed) {
  2278. @$compressed || return '';
  2279. # Build the dictionary
  2280. my $dict_size = 256;
  2281. my @dictionary = map { chr($_) } 0 .. $dict_size - 1;
  2282. my $w = $dictionary[$compressed->[0]];
  2283. my $result = $w;
  2284. foreach my $j (1 .. $#$compressed) {
  2285. my $k = $compressed->[$j];
  2286. my $entry =
  2287. ($k < $dict_size) ? $dictionary[$k]
  2288. : ($k == $dict_size) ? ($w . substr($w, 0, 1))
  2289. : confess "Bad compressed k: $k";
  2290. $result .= $entry;
  2291. # Add w+entry[0] to the dictionary
  2292. push @dictionary, $w . substr($entry, 0, 1);
  2293. ++$dict_size;
  2294. $w = $entry;
  2295. }
  2296. return $result;
  2297. }
  2298. sub lzw_compress ($chunk, $enc_method = \&abc_encode) {
  2299. $enc_method->(lzw_encode($chunk));
  2300. }
  2301. sub lzw_decompress ($fh, $dec_method = \&abc_decode) {
  2302. if (ref($fh) eq '') {
  2303. open(my $fh2, '<:raw', \$fh) or confess "error: $!";
  2304. return __SUB__->($fh2, $dec_method);
  2305. }
  2306. lzw_decode($dec_method->($fh));
  2307. }
  2308. ###################################
  2309. # CRC-32 Pure Perl implementation
  2310. ###################################
  2311. sub _create_crc32_table {
  2312. my @table;
  2313. for my $i (0 .. 255) {
  2314. my $k = $i;
  2315. for (0 .. 7) {
  2316. if ($k & 1) {
  2317. $k >>= 1;
  2318. $k ^= 0xedb88320;
  2319. }
  2320. else {
  2321. $k >>= 1;
  2322. }
  2323. }
  2324. push(@table, $k & 0xffffffff);
  2325. }
  2326. return \@table;
  2327. }
  2328. sub crc32($str, $crc = 0) {
  2329. state $crc_table = _create_crc32_table();
  2330. $crc &= 0xffffffff;
  2331. $crc ^= 0xffffffff;
  2332. foreach my $c (unpack("C*", $str)) {
  2333. $crc = (($crc >> 8) ^ $crc_table->[($crc & 0xff) ^ $c]);
  2334. }
  2335. return (($crc & 0xffffffff) ^ 0xffffffff);
  2336. }
  2337. #############################
  2338. # Bzip2 compression
  2339. #############################
  2340. sub _bzip2_encode_code_lengths($dict) {
  2341. my @lengths;
  2342. foreach my $symbol (0 .. max(keys %$dict) // 0) {
  2343. if (exists($dict->{$symbol})) {
  2344. push @lengths, length($dict->{$symbol});
  2345. }
  2346. else {
  2347. confess "Incomplete Huffman tree not supported";
  2348. push @lengths, 0;
  2349. }
  2350. }
  2351. my $deltas = deltas(\@lengths);
  2352. $VERBOSE && say STDERR "Code lengths: (@lengths)";
  2353. $VERBOSE && say STDERR "Code lengths deltas: (@$deltas)";
  2354. my $bitstring = int2bits(shift(@$deltas), 5) . '0';
  2355. foreach my $d (@$deltas) {
  2356. $bitstring .= (($d > 0) ? ('10' x $d) : ('11' x abs($d))) . '0';
  2357. }
  2358. $VERBOSE && say STDERR "Deltas bitstring: $bitstring";
  2359. return $bitstring;
  2360. }
  2361. sub bzip2_compress($fh) {
  2362. if (ref($fh) eq '') {
  2363. open(my $fh2, '<:raw', \$fh) or confess "error: $!";
  2364. return __SUB__->($fh2);
  2365. }
  2366. my $level = 9;
  2367. # There is a CRC32 issue on some non-compressible inputs, when using very large chunk sizes
  2368. ## my $CHUNK_SIZE = 100_000 * $level;
  2369. my $CHUNK_SIZE = 1 << 17;
  2370. my $compressed = "BZh" . $level;
  2371. state $block_header_bitstring = unpack("B48", "1AY&SY");
  2372. state $block_footer_bitstring = unpack("B48", "\27rE8P\x90");
  2373. my $bitstring = '';
  2374. my $stream_crc32 = 0;
  2375. while (read($fh, (my $chunk), $CHUNK_SIZE)) {
  2376. $bitstring .= $block_header_bitstring;
  2377. my $crc32 = crc32(pack('b*', unpack('B*', $chunk)));
  2378. $VERBOSE && say STDERR "CRC32: $crc32";
  2379. $crc32 = oct('0b' . int2bits_lsb($crc32, 32));
  2380. $VERBOSE && say STDERR "Bzip2-CRC32: $crc32";
  2381. $stream_crc32 = ($crc32 ^ (0xffffffff & ((0xffffffff & ($stream_crc32 << 1)) | (($stream_crc32 >> 31) & 0x1)))) & 0xffffffff;
  2382. $bitstring .= int2bits($crc32, 32);
  2383. $bitstring .= '0'; # not randomized
  2384. my $rle4 = rle4_encode($chunk);
  2385. my ($bwt, $bwt_idx) = bwt_encode(symbols2string($rle4));
  2386. $bitstring .= int2bits($bwt_idx, 24);
  2387. my ($mtf, $alphabet) = mtf_encode($bwt);
  2388. $VERBOSE && say STDERR "Alphabet: (@$alphabet)";
  2389. $bitstring .= unpack('B*', encode_alphabet_256($alphabet));
  2390. my @zrle = reverse @{zrle_encode([reverse @$mtf])};
  2391. my $eob = scalar(@$alphabet) + 1; # end-of-block symbol
  2392. $VERBOSE && say STDERR "EOB symbol: $eob";
  2393. push @zrle, $eob;
  2394. my ($dict) = huffman_from_symbols([@zrle, 0 .. $eob - 1]);
  2395. my $num_sels = int(sprintf('%.0f', 0.5 + (scalar(@zrle) / 50))); # ceil(|zrle| / 50)
  2396. $VERBOSE && say STDERR "Number of selectors: $num_sels";
  2397. $bitstring .= int2bits(2, 3);
  2398. $bitstring .= int2bits($num_sels, 15);
  2399. $bitstring .= '0' x $num_sels;
  2400. $bitstring .= _bzip2_encode_code_lengths($dict) x 2;
  2401. $bitstring .= join('', @{$dict}{@zrle});
  2402. $compressed .= pack('B*', substr($bitstring, 0, length($bitstring) - (length($bitstring) % 8), ''));
  2403. }
  2404. $bitstring .= $block_footer_bitstring;
  2405. $bitstring .= int2bits($stream_crc32, 32);
  2406. $compressed .= pack('B*', $bitstring);
  2407. return $compressed;
  2408. }
  2409. #################################
  2410. # Bzip2 decompression
  2411. #################################
  2412. sub bzip2_decompress($fh) {
  2413. if (ref($fh) eq '') {
  2414. open(my $fh2, '<:raw', \$fh) or confess "error: $!";
  2415. return __SUB__->($fh2);
  2416. }
  2417. state $MaxHuffmanBits = 20;
  2418. my $decompressed = '';
  2419. while (!eof($fh)) {
  2420. my $buffer = '';
  2421. (bytes2int($fh, 2) == 0x425a and getc($fh) eq 'h')
  2422. or confess "Not a valid Bzip2 archive";
  2423. my $level = getc($fh);
  2424. if ($level !~ /^[1-9]\z/) {
  2425. confess "Invalid level: $level";
  2426. }
  2427. $VERBOSE && say STDERR "Compression level: $level";
  2428. my $stream_crc32 = 0;
  2429. while (!eof($fh)) {
  2430. my $block_magic = pack "B48", join('', map { read_bit($fh, \$buffer) } 1 .. 48);
  2431. if ($block_magic eq "1AY&SY") { # BlockHeader
  2432. $VERBOSE && say STDERR "Block header detected";
  2433. my $crc32 = bits2int($fh, 32, \$buffer);
  2434. $VERBOSE && say STDERR "CRC32 = $crc32";
  2435. $stream_crc32 = ($crc32 ^ (0xffffffff & ((0xffffffff & ($stream_crc32 << 1)) | (($stream_crc32 >> 31) & 0x1)))) & 0xffffffff;
  2436. my $randomized = read_bit($fh, \$buffer);
  2437. $randomized == 0 or confess "randomized not supported";
  2438. my $bwt_idx = bits2int($fh, 24, \$buffer);
  2439. $VERBOSE && say STDERR "BWT index: $bwt_idx";
  2440. my @alphabet;
  2441. my $l1 = bits2int($fh, 16, \$buffer);
  2442. for my $i (0 .. 15) {
  2443. if ($l1 & (0x8000 >> $i)) {
  2444. my $l2 = bits2int($fh, 16, \$buffer);
  2445. for my $j (0 .. 15) {
  2446. if ($l2 & (0x8000 >> $j)) {
  2447. push @alphabet, 16 * $i + $j;
  2448. }
  2449. }
  2450. }
  2451. }
  2452. $VERBOSE && say STDERR "MTF alphabet: (@alphabet)";
  2453. my $num_trees = bits2int($fh, 3, \$buffer);
  2454. $VERBOSE && say STDERR "Number or trees: $num_trees";
  2455. my $num_sels = bits2int($fh, 15, \$buffer);
  2456. $VERBOSE && say STDERR "Number of selectors: $num_sels";
  2457. my @idxs;
  2458. for (1 .. $num_sels) {
  2459. my $i = 0;
  2460. while (read_bit($fh, \$buffer)) {
  2461. $i += 1;
  2462. ($i < $num_trees) or confess "error";
  2463. }
  2464. push @idxs, $i;
  2465. }
  2466. my $sels = mtf_decode(\@idxs, [0 .. $num_trees - 1]);
  2467. $VERBOSE && say STDERR "Selectors: (@$sels)";
  2468. my $num_syms = scalar(@alphabet) + 2;
  2469. my @trees;
  2470. for (1 .. $num_trees) {
  2471. my @clens;
  2472. my $clen = bits2int($fh, 5, \$buffer);
  2473. for (1 .. $num_syms) {
  2474. while (1) {
  2475. ($clen > 0 and $clen <= $MaxHuffmanBits) or confess "invalid code length: $clen";
  2476. if (not read_bit($fh, \$buffer)) {
  2477. last;
  2478. }
  2479. $clen -= read_bit($fh, \$buffer) ? 1 : -1;
  2480. }
  2481. push @clens, $clen;
  2482. }
  2483. push @trees, \@clens;
  2484. $VERBOSE && say STDERR "Code lengths: (@clens)";
  2485. }
  2486. foreach my $tree (@trees) {
  2487. my $maxLen = max(@$tree);
  2488. my $sum = 1 << $maxLen;
  2489. for my $clen (@$tree) {
  2490. $sum -= (1 << $maxLen) >> $clen;
  2491. }
  2492. $sum == 0 or confess "incomplete tree not supported: (@$tree)";
  2493. }
  2494. my @huffman_trees = map { (huffman_from_code_lengths($_))[1] } @trees;
  2495. my $eob = @alphabet + 1;
  2496. my @zrle;
  2497. my $code = '';
  2498. my $sel_idx = 0;
  2499. my $tree = $huffman_trees[$sels->[$sel_idx]];
  2500. my $decoded = 50;
  2501. while (!eof($fh)) {
  2502. $code .= read_bit($fh, \$buffer);
  2503. if (length($code) > $MaxHuffmanBits) {
  2504. confess "[!] Something went wrong: length of code `$code` is > $MaxHuffmanBits.";
  2505. }
  2506. if (exists($tree->{$code})) {
  2507. my $sym = $tree->{$code};
  2508. if ($sym == $eob) { # end of block marker
  2509. $VERBOSE && say STDERR "EOB detected: $sym";
  2510. last;
  2511. }
  2512. push @zrle, $sym;
  2513. $code = '';
  2514. if (--$decoded <= 0) {
  2515. if (++$sel_idx <= $#$sels) {
  2516. $tree = $huffman_trees[$sels->[$sel_idx]];
  2517. }
  2518. else {
  2519. confess "No more selectors"; # should not happen
  2520. }
  2521. $decoded = 50;
  2522. }
  2523. }
  2524. }
  2525. my @mtf = reverse @{zrle_decode([reverse @zrle])};
  2526. my $bwt = symbols2string mtf_decode(\@mtf, \@alphabet);
  2527. my $rle4 = string2symbols bwt_decode($bwt, $bwt_idx);
  2528. my $data = rle4_decode($rle4);
  2529. my $dec = symbols2string($data);
  2530. my $new_crc32 = oct('0b' . int2bits_lsb(crc32(pack('b*', unpack('B*', $dec))), 32));
  2531. $VERBOSE && say STDERR "Computed CRC32: $new_crc32";
  2532. if ($crc32 != $new_crc32) {
  2533. confess "CRC32 error: $crc32 (stored) != $new_crc32 (actual)";
  2534. }
  2535. $decompressed .= $dec;
  2536. }
  2537. elsif ($block_magic eq "\27rE8P\x90") { # BlockFooter
  2538. $VERBOSE && say STDERR "Block footer detected";
  2539. my $stored_stream_crc32 = bits2int($fh, 32, \$buffer);
  2540. $VERBOSE && say STDERR "Stream CRC: $stored_stream_crc32";
  2541. if ($stored_stream_crc32 != $stream_crc32) {
  2542. confess "Stream CRC32 error: $stored_stream_crc32 (stored) != $stream_crc32 (actual)";
  2543. }
  2544. $buffer = '';
  2545. last;
  2546. }
  2547. else {
  2548. confess "Unknown block magic: $block_magic";
  2549. }
  2550. }
  2551. $VERBOSE && say STDERR "End of container";
  2552. }
  2553. return $decompressed;
  2554. }
  2555. ########################################
  2556. # GZIP compressor
  2557. ########################################
  2558. sub _code_length_encoding ($dict) {
  2559. my @lengths;
  2560. foreach my $symbol (0 .. max(keys %$dict) // 0) {
  2561. if (exists($dict->{$symbol})) {
  2562. push @lengths, length($dict->{$symbol});
  2563. }
  2564. else {
  2565. push @lengths, 0;
  2566. }
  2567. }
  2568. my $size = scalar(@lengths);
  2569. my $rl = run_length(\@lengths);
  2570. my $offset_bits = '';
  2571. my @CL_symbols;
  2572. foreach my $pair (@$rl) {
  2573. my ($v, $run) = @$pair;
  2574. while ($v == 0 and $run >= 3) {
  2575. if ($run >= 11) {
  2576. push @CL_symbols, 18;
  2577. $run -= 11;
  2578. $offset_bits .= int2bits_lsb(min($run, 127), 7);
  2579. $run -= 127;
  2580. }
  2581. if ($run >= 3 and $run < 11) {
  2582. push @CL_symbols, 17;
  2583. $run -= 3;
  2584. $offset_bits .= int2bits_lsb(min($run, 7), 3);
  2585. $run -= 7;
  2586. }
  2587. }
  2588. if ($v == 0) {
  2589. push(@CL_symbols, (0) x $run) if ($run > 0);
  2590. next;
  2591. }
  2592. push @CL_symbols, $v;
  2593. $run -= 1;
  2594. while ($run >= 3) {
  2595. push @CL_symbols, 16;
  2596. $run -= 3;
  2597. $offset_bits .= int2bits_lsb(min($run, 3), 2);
  2598. $run -= 3;
  2599. }
  2600. push(@CL_symbols, ($v) x $run) if ($run > 0);
  2601. }
  2602. return (\@CL_symbols, $size, $offset_bits);
  2603. }
  2604. sub _cl_encoded_bitstring ($cl_dict, $cl_symbols, $offset_bits) {
  2605. my $bitstring = '';
  2606. foreach my $cl_symbol (@$cl_symbols) {
  2607. $bitstring .= $cl_dict->{$cl_symbol};
  2608. if ($cl_symbol == 16) {
  2609. $bitstring .= substr($offset_bits, 0, 2, '');
  2610. }
  2611. elsif ($cl_symbol == 17) {
  2612. $bitstring .= substr($offset_bits, 0, 3, '');
  2613. }
  2614. elsif ($cl_symbol == 18) {
  2615. $bitstring .= substr($offset_bits, 0, 7, '');
  2616. }
  2617. }
  2618. return $bitstring;
  2619. }
  2620. sub _create_cl_dictionary (@cl_symbols) {
  2621. my @keys;
  2622. my $freq = frequencies(\@cl_symbols);
  2623. while (1) {
  2624. my ($cl_dict) = huffman_from_freq($freq);
  2625. # The CL codes must have at most 7 bits
  2626. return $cl_dict if all { length($_) <= 7 } values %$cl_dict;
  2627. if (scalar(@keys) == 0) {
  2628. @keys = sort { $freq->{$b} <=> $freq->{$a} } keys %$freq;
  2629. }
  2630. # Scale down the frequencies and try again
  2631. foreach my $k (@keys) {
  2632. if ($freq->{$k} > 1) {
  2633. $freq->{$k} >>= 1;
  2634. }
  2635. else {
  2636. last;
  2637. }
  2638. }
  2639. }
  2640. }
  2641. sub _create_block_type_2 ($literals, $distances, $lengths) {
  2642. state $deflate_tables = [make_deflate_tables()];
  2643. my ($DISTANCE_SYMBOLS, $LENGTH_SYMBOLS, $LENGTH_INDICES) = @$deflate_tables;
  2644. my @CL_order = (16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15);
  2645. my $bitstring = '01';
  2646. my @len_symbols;
  2647. my @dist_symbols;
  2648. my $offset_bits = '';
  2649. foreach my $k (0 .. $#$literals) {
  2650. if ($lengths->[$k] == 0) {
  2651. push @len_symbols, $literals->[$k];
  2652. next;
  2653. }
  2654. my $len = $lengths->[$k];
  2655. my $dist = $distances->[$k];
  2656. {
  2657. my $len_idx = $LENGTH_INDICES->[$len];
  2658. my ($min, $bits) = @{$LENGTH_SYMBOLS->[$len_idx]};
  2659. push @len_symbols, [$len_idx + 256 - 1, $bits];
  2660. $offset_bits .= int2bits_lsb($len - $min, $bits) if ($bits > 0);
  2661. }
  2662. {
  2663. my $dist_idx = find_deflate_index($dist, $DISTANCE_SYMBOLS);
  2664. my ($min, $bits) = @{$DISTANCE_SYMBOLS->[$dist_idx]};
  2665. push @dist_symbols, [$dist_idx - 1, $bits];
  2666. $offset_bits .= int2bits_lsb($dist - $min, $bits) if ($bits > 0);
  2667. }
  2668. }
  2669. push @len_symbols, 256; # end-of-block marker
  2670. my ($dict) = huffman_from_symbols([map { ref($_) eq 'ARRAY' ? $_->[0] : $_ } @len_symbols]);
  2671. my ($dist_dict) = huffman_from_symbols([map { $_->[0] } @dist_symbols]);
  2672. my ($LL_code_lengths, $LL_cl_len, $LL_offset_bits) = _code_length_encoding($dict);
  2673. my ($distance_code_lengths, $distance_cl_len, $distance_offset_bits) = _code_length_encoding($dist_dict);
  2674. my $cl_dict = _create_cl_dictionary(@$LL_code_lengths, @$distance_code_lengths);
  2675. my @CL_code_lenghts;
  2676. foreach my $symbol (0 .. 18) {
  2677. if (exists($cl_dict->{$symbol})) {
  2678. push @CL_code_lenghts, length($cl_dict->{$symbol});
  2679. }
  2680. else {
  2681. push @CL_code_lenghts, 0;
  2682. }
  2683. }
  2684. # Put the CL codes in the required order
  2685. @CL_code_lenghts = @CL_code_lenghts[@CL_order];
  2686. while (scalar(@CL_code_lenghts) > 4 and $CL_code_lenghts[-1] == 0) {
  2687. pop @CL_code_lenghts;
  2688. }
  2689. my $CL_code_lengths_bitstring = join('', map { int2bits_lsb($_, 3) } @CL_code_lenghts);
  2690. my $LL_code_lengths_bitstring = _cl_encoded_bitstring($cl_dict, $LL_code_lengths, $LL_offset_bits);
  2691. my $distance_code_lengths_bitstring = _cl_encoded_bitstring($cl_dict, $distance_code_lengths, $distance_offset_bits);
  2692. # (5 bits) HLIT = (number of LL code entries present) - 257
  2693. my $HLIT = $LL_cl_len - 257;
  2694. # (5 bits) HDIST = (number of distance code entries present) - 1
  2695. my $HDIST = $distance_cl_len - 1;
  2696. # (4 bits) HCLEN = (number of CL code entries present) - 4
  2697. my $HCLEN = scalar(@CL_code_lenghts) - 4;
  2698. $bitstring .= int2bits_lsb($HLIT, 5);
  2699. $bitstring .= int2bits_lsb($HDIST, 5);
  2700. $bitstring .= int2bits_lsb($HCLEN, 4);
  2701. $bitstring .= $CL_code_lengths_bitstring;
  2702. $bitstring .= $LL_code_lengths_bitstring;
  2703. $bitstring .= $distance_code_lengths_bitstring;
  2704. foreach my $symbol (@len_symbols) {
  2705. if (ref($symbol) eq 'ARRAY') {
  2706. my ($len, $len_offset) = @$symbol;
  2707. $bitstring .= $dict->{$len};
  2708. $bitstring .= substr($offset_bits, 0, $len_offset, '') if ($len_offset > 0);
  2709. my ($dist, $dist_offset) = @{shift(@dist_symbols)};
  2710. $bitstring .= $dist_dict->{$dist};
  2711. $bitstring .= substr($offset_bits, 0, $dist_offset, '') if ($dist_offset > 0);
  2712. }
  2713. else {
  2714. $bitstring .= $dict->{$symbol};
  2715. }
  2716. }
  2717. return $bitstring;
  2718. }
  2719. sub _create_block_type_1 ($literals, $distances, $lengths) {
  2720. state $deflate_tables = [make_deflate_tables()];
  2721. my ($DISTANCE_SYMBOLS, $LENGTH_SYMBOLS, $LENGTH_INDICES) = @$deflate_tables;
  2722. state $dict;
  2723. state $dist_dict;
  2724. if (!defined($dict)) {
  2725. my @code_lengths = (0) x 288;
  2726. foreach my $i (0 .. 143) {
  2727. $code_lengths[$i] = 8;
  2728. }
  2729. foreach my $i (144 .. 255) {
  2730. $code_lengths[$i] = 9;
  2731. }
  2732. foreach my $i (256 .. 279) {
  2733. $code_lengths[$i] = 7;
  2734. }
  2735. foreach my $i (280 .. 287) {
  2736. $code_lengths[$i] = 8;
  2737. }
  2738. ($dict) = huffman_from_code_lengths(\@code_lengths);
  2739. ($dist_dict) = huffman_from_code_lengths([(5) x 32]);
  2740. }
  2741. my $bitstring = '10';
  2742. foreach my $k (0 .. $#$literals) {
  2743. if ($lengths->[$k] == 0) {
  2744. $bitstring .= $dict->{$literals->[$k]};
  2745. next;
  2746. }
  2747. my $len = $lengths->[$k];
  2748. my $dist = $distances->[$k];
  2749. {
  2750. my $len_idx = $LENGTH_INDICES->[$len];
  2751. my ($min, $bits) = @{$LENGTH_SYMBOLS->[$len_idx]};
  2752. $bitstring .= $dict->{$len_idx + 256 - 1};
  2753. $bitstring .= int2bits_lsb($len - $min, $bits) if ($bits > 0);
  2754. }
  2755. {
  2756. my $dist_idx = find_deflate_index($dist, $DISTANCE_SYMBOLS);
  2757. my ($min, $bits) = @{$DISTANCE_SYMBOLS->[$dist_idx]};
  2758. $bitstring .= $dist_dict->{$dist_idx - 1};
  2759. $bitstring .= int2bits_lsb($dist - $min, $bits) if ($bits > 0);
  2760. }
  2761. }
  2762. $bitstring .= $dict->{256}; # end-of-block symbol
  2763. return $bitstring;
  2764. }
  2765. sub _create_block_type_0($chunk) {
  2766. my $chunk_len = length($chunk);
  2767. my $len = int2bits_lsb($chunk_len, 16);
  2768. my $nlen = int2bits_lsb((~$chunk_len) & 0xffff, 16);
  2769. $len . $nlen;
  2770. }
  2771. sub gzip_compress ($in_fh, $lzss_encoding_sub = \&lzss_encode) {
  2772. if (ref($in_fh) eq '') {
  2773. open(my $fh2, '<:raw', \$in_fh) or confess "error: $!";
  2774. return __SUB__->($fh2);
  2775. }
  2776. my $compressed = '';
  2777. open my $out_fh, '>:raw', \$compressed;
  2778. local $LZ_MIN_LEN = 4 if ($LZ_MIN_LEN < 4); # minimum match length in LZ parsing
  2779. local $LZ_MAX_LEN = 258; # maximum match length in LZ parsing
  2780. local $LZ_MAX_DIST = (1 << 15) - 1; # maximum allowed back-reference distance in LZ parsing
  2781. state $MAGIC = pack('C*', 0x1f, 0x8b); # magic MIME type
  2782. state $CM = chr(0x08); # 0x08 = DEFLATE
  2783. state $FLAGS = chr(0x00); # flags
  2784. state $MTIME = pack('C*', (0x00) x 4); # modification time
  2785. state $XFLAGS = chr(0x00); # extra flags
  2786. state $OS = chr(0x03); # 0x03 = Unix
  2787. print $out_fh $MAGIC, $CM, $FLAGS, $MTIME, $XFLAGS, $OS;
  2788. my $total_length = 0;
  2789. my $crc32 = 0;
  2790. my $bitstring = '';
  2791. if (eof($in_fh)) { # empty file
  2792. $bitstring = '1' . '10' . '0000000';
  2793. }
  2794. state $CHUNK_SIZE = (1 << 15) - 1;
  2795. while (read($in_fh, (my $chunk), $CHUNK_SIZE)) {
  2796. $crc32 = crc32($chunk, $crc32);
  2797. $total_length += length($chunk);
  2798. my ($literals, $distances, $lengths) = $lzss_encoding_sub->($chunk);
  2799. $bitstring .= eof($in_fh) ? '1' : '0';
  2800. my $bt1_bitstring = _create_block_type_1($literals, $distances, $lengths);
  2801. # When block type 1 is larger than the input, then we have random uncompressible data: use block type 0
  2802. if ((length($bt1_bitstring) >> 3) > length($chunk) + 5) {
  2803. $VERBOSE && say STDERR ":: Using block type: 0";
  2804. $bitstring .= '00';
  2805. print $out_fh pack('b*', $bitstring); # pads to a byte
  2806. print $out_fh pack('b*', _create_block_type_0($chunk));
  2807. print $out_fh $chunk;
  2808. $bitstring = '';
  2809. next;
  2810. }
  2811. my $bt2_bitstring = _create_block_type_2($literals, $distances, $lengths);
  2812. # When block type 2 is larger than block type 1, then we may have very small data
  2813. if (length($bt2_bitstring) > length($bt1_bitstring)) {
  2814. $VERBOSE && say STDERR ":: Using block type: 1";
  2815. $bitstring .= $bt1_bitstring;
  2816. }
  2817. else {
  2818. $VERBOSE && say STDERR ":: Using block type: 2";
  2819. $bitstring .= $bt2_bitstring;
  2820. }
  2821. print $out_fh pack('b*', substr($bitstring, 0, length($bitstring) - (length($bitstring) % 8), ''));
  2822. }
  2823. if ($bitstring ne '') {
  2824. print $out_fh pack('b*', $bitstring);
  2825. }
  2826. print $out_fh pack('b*', int2bits_lsb($crc32, 32));
  2827. print $out_fh pack('b*', int2bits_lsb($total_length, 32));
  2828. return $compressed;
  2829. }
  2830. ###################
  2831. # GZIP DECOMPRESSOR
  2832. ###################
  2833. sub _extract_block_type_0 ($in_fh, $buffer) {
  2834. my $len = bits2int_lsb($in_fh, 16, $buffer);
  2835. my $nlen = bits2int_lsb($in_fh, 16, $buffer);
  2836. my $expected_nlen = (~$len) & 0xffff;
  2837. if ($expected_nlen != $nlen) {
  2838. confess "[!] The ~length value is not correct: $nlen (actual) != $expected_nlen (expected)";
  2839. }
  2840. else {
  2841. $VERBOSE && print STDERR ":: Chunk length: $len\n";
  2842. }
  2843. read($in_fh, (my $chunk), $len) // confess "Read error: $!";
  2844. return $chunk;
  2845. }
  2846. sub _deflate_decode_huffman($in_fh, $buffer, $rev_dict, $dist_rev_dict, $search_window) {
  2847. state $deflate_tables = [make_deflate_tables()];
  2848. my ($DISTANCE_SYMBOLS, $LENGTH_SYMBOLS, $LENGTH_INDICES) = @$deflate_tables;
  2849. my $data = '';
  2850. my $code = '';
  2851. my $max_ll_code_len = max(map { length($_) } keys %$rev_dict);
  2852. my $max_dist_code_len = max(map { length($_) } keys %$dist_rev_dict);
  2853. while (1) {
  2854. $code .= read_bit_lsb($in_fh, $buffer);
  2855. if (length($code) > $max_ll_code_len) {
  2856. confess "[!] Something went wrong: length of LL code `$code` is > $max_ll_code_len.";
  2857. }
  2858. if (exists($rev_dict->{$code})) {
  2859. my $symbol = $rev_dict->{$code};
  2860. if ($symbol <= 255) {
  2861. $data .= chr($symbol);
  2862. $$search_window .= chr($symbol);
  2863. }
  2864. elsif ($symbol == 256) { # end-of-block marker
  2865. $code = '';
  2866. last;
  2867. }
  2868. else { # LZSS decoding
  2869. my ($length, $LL_bits) = @{$LENGTH_SYMBOLS->[$symbol - 256 + 1]};
  2870. $length += bits2int_lsb($in_fh, $LL_bits, $buffer) if ($LL_bits > 0);
  2871. my $dist_code = '';
  2872. while (1) {
  2873. $dist_code .= read_bit_lsb($in_fh, $buffer);
  2874. if (length($dist_code) > $max_dist_code_len) {
  2875. confess "[!] Something went wrong: length of distance code `$dist_code` is > $max_dist_code_len.";
  2876. }
  2877. if (exists($dist_rev_dict->{$dist_code})) {
  2878. last;
  2879. }
  2880. }
  2881. my ($dist, $dist_bits) = @{$DISTANCE_SYMBOLS->[$dist_rev_dict->{$dist_code} + 1]};
  2882. $dist += bits2int_lsb($in_fh, $dist_bits, $buffer) if ($dist_bits > 0);
  2883. if ($dist == 1) {
  2884. $$search_window .= substr($$search_window, -1) x $length;
  2885. }
  2886. elsif ($dist >= $length) { # non-overlapping matches
  2887. $$search_window .= substr($$search_window, length($$search_window) - $dist, $length);
  2888. }
  2889. else { # overlapping matches
  2890. foreach my $i (1 .. $length) {
  2891. $$search_window .= substr($$search_window, length($$search_window) - $dist, 1);
  2892. }
  2893. }
  2894. $data .= substr($$search_window, -$length);
  2895. }
  2896. $code = '';
  2897. }
  2898. }
  2899. if ($code ne '') {
  2900. confess "[!] Something went wrong: code `$code` is not empty!";
  2901. }
  2902. return $data;
  2903. }
  2904. sub _extract_block_type_1 ($in_fh, $buffer, $search_window) {
  2905. state $rev_dict;
  2906. state $dist_rev_dict;
  2907. if (!defined($rev_dict)) {
  2908. my @code_lengths = (0) x 288;
  2909. foreach my $i (0 .. 143) {
  2910. $code_lengths[$i] = 8;
  2911. }
  2912. foreach my $i (144 .. 255) {
  2913. $code_lengths[$i] = 9;
  2914. }
  2915. foreach my $i (256 .. 279) {
  2916. $code_lengths[$i] = 7;
  2917. }
  2918. foreach my $i (280 .. 287) {
  2919. $code_lengths[$i] = 8;
  2920. }
  2921. (undef, $rev_dict) = huffman_from_code_lengths(\@code_lengths);
  2922. (undef, $dist_rev_dict) = huffman_from_code_lengths([(5) x 32]);
  2923. }
  2924. _deflate_decode_huffman($in_fh, $buffer, $rev_dict, $dist_rev_dict, $search_window);
  2925. }
  2926. sub _decode_CL_lengths($in_fh, $buffer, $CL_rev_dict, $size) {
  2927. my @lengths;
  2928. my $code = '';
  2929. while (1) {
  2930. $code .= read_bit_lsb($in_fh, $buffer);
  2931. if (length($code) > 7) {
  2932. confess "[!] Something went wrong: length of CL code `$code` is > 7.";
  2933. }
  2934. if (exists($CL_rev_dict->{$code})) {
  2935. my $CL_symbol = $CL_rev_dict->{$code};
  2936. if ($CL_symbol <= 15) {
  2937. push @lengths, $CL_symbol;
  2938. }
  2939. elsif ($CL_symbol == 16) {
  2940. push @lengths, ($lengths[-1]) x (3 + bits2int_lsb($in_fh, 2, $buffer));
  2941. }
  2942. elsif ($CL_symbol == 17) {
  2943. push @lengths, (0) x (3 + bits2int_lsb($in_fh, 3, $buffer));
  2944. }
  2945. elsif ($CL_symbol == 18) {
  2946. push @lengths, (0) x (11 + bits2int_lsb($in_fh, 7, $buffer));
  2947. }
  2948. else {
  2949. confess "Unknown CL symbol: $CL_symbol";
  2950. }
  2951. $code = '';
  2952. last if (scalar(@lengths) >= $size);
  2953. }
  2954. }
  2955. if (scalar(@lengths) != $size) {
  2956. confess "Something went wrong: size $size (expected) != ", scalar(@lengths);
  2957. }
  2958. if ($code ne '') {
  2959. confess "Something went wrong: code `$code` is not empty!";
  2960. }
  2961. return @lengths;
  2962. }
  2963. sub _extract_block_type_2 ($in_fh, $buffer, $search_window) {
  2964. # (5 bits) HLIT = (number of LL code entries present) - 257
  2965. my $HLIT = bits2int_lsb($in_fh, 5, $buffer) + 257;
  2966. # (5 bits) HDIST = (number of distance code entries present) - 1
  2967. my $HDIST = bits2int_lsb($in_fh, 5, $buffer) + 1;
  2968. # (4 bits) HCLEN = (number of CL code entries present) - 4
  2969. my $HCLEN = bits2int_lsb($in_fh, 4, $buffer) + 4;
  2970. $VERBOSE && say STDERR ":: Number of LL codes: $HLIT";
  2971. $VERBOSE && say STDERR ":: Number of dist codes: $HDIST";
  2972. $VERBOSE && say STDERR ":: Number of CL codes: $HCLEN";
  2973. my @CL_code_lenghts = (0) x 19;
  2974. my @CL_order = (16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15);
  2975. foreach my $i (0 .. $HCLEN - 1) {
  2976. $CL_code_lenghts[$CL_order[$i]] = bits2int_lsb($in_fh, 3, $buffer);
  2977. }
  2978. $VERBOSE && say STDERR ":: CL code lengths: @CL_code_lenghts";
  2979. my (undef, $CL_rev_dict) = huffman_from_code_lengths(\@CL_code_lenghts);
  2980. my @LL_CL_lengths = _decode_CL_lengths($in_fh, $buffer, $CL_rev_dict, $HLIT);
  2981. my @dist_CL_lengths = _decode_CL_lengths($in_fh, $buffer, $CL_rev_dict, $HDIST);
  2982. my (undef, $LL_rev_dict) = huffman_from_code_lengths(\@LL_CL_lengths);
  2983. my (undef, $dist_rev_dict) = huffman_from_code_lengths(\@dist_CL_lengths);
  2984. _deflate_decode_huffman($in_fh, $buffer, $LL_rev_dict, $dist_rev_dict, $search_window);
  2985. }
  2986. sub gzip_decompress ($in_fh) {
  2987. if (ref($in_fh) eq '') {
  2988. open(my $fh2, '<:raw', \$in_fh) or confess "error: $!";
  2989. return __SUB__->($fh2);
  2990. }
  2991. my $decompressed = '';
  2992. open my $out_fh, '>:raw', \$decompressed;
  2993. local $LZ_MIN_LEN = 4 if ($LZ_MIN_LEN < 4); # minimum match length in LZ parsing
  2994. local $LZ_MAX_LEN = 258; # maximum match length in LZ parsing
  2995. local $LZ_MAX_DIST = (1 << 15) - 1; # maximum allowed back-reference distance in LZ parsing
  2996. my $MAGIC = (getc($in_fh) // confess "error") . (getc($in_fh) // confess "error");
  2997. if ($MAGIC ne pack('C*', 0x1f, 0x8b)) {
  2998. confess "Not a valid Gzip container!";
  2999. }
  3000. my $CM = getc($in_fh) // confess "error"; # 0x08 = DEFLATE
  3001. my $FLAGS = ord(getc($in_fh) // confess "error"); # flags
  3002. my $MTIME = join('', map { getc($in_fh) // confess "error" } 1 .. 4); # modification time
  3003. my $XFLAGS = getc($in_fh) // confess "error"; # extra flags
  3004. my $OS = getc($in_fh) // confess "error"; # 0x03 = Unix
  3005. if ($CM ne chr(0x08)) {
  3006. confess "Only DEFLATE compression method is supported (0x08)! Got: 0x", sprintf('%02x', ord($CM));
  3007. }
  3008. # Reference:
  3009. # https://web.archive.org/web/20240221024029/https://forensics.wiki/gzip/
  3010. my $has_filename = 0;
  3011. my $has_comment = 0;
  3012. my $has_header_checksum = 0;
  3013. my $has_extra_fields = 0;
  3014. if ($FLAGS & 0x08) {
  3015. $has_filename = 1;
  3016. }
  3017. if ($FLAGS & 0x10) {
  3018. $has_comment = 1;
  3019. }
  3020. if ($FLAGS & 0x02) {
  3021. $has_header_checksum = 1;
  3022. }
  3023. if ($FLAGS & 0x04) {
  3024. $has_extra_fields = 1;
  3025. }
  3026. if ($has_extra_fields) {
  3027. my $size = bytes2int_lsb($in_fh, 2);
  3028. read($in_fh, (my $extra_field_data), $size) // confess "can't read extra field data: $!";
  3029. $VERBOSE && say STDERR ":: Extra field data: $extra_field_data";
  3030. }
  3031. if ($has_filename) {
  3032. my $filename = read_null_terminated($in_fh); # filename
  3033. $VERBOSE && say STDERR ":: Filename: $filename";
  3034. }
  3035. if ($has_comment) {
  3036. my $comment = read_null_terminated($in_fh); # comment
  3037. $VERBOSE && say STDERR ":: Comment: $comment";
  3038. }
  3039. # TODO: verify the header checksum
  3040. if ($has_header_checksum) {
  3041. my $header_checksum = bytes2int_lsb($in_fh, 2);
  3042. $VERBOSE && say STDERR ":: Header checksum: $header_checksum";
  3043. }
  3044. my $crc32 = 0;
  3045. my $actual_length = 0;
  3046. my $buffer = '';
  3047. my $search_window = '';
  3048. my $window_size = $Compression::Util::LZ_MAX_DIST;
  3049. while (1) {
  3050. my $is_last = read_bit_lsb($in_fh, \$buffer);
  3051. my $block_type = bits2int_lsb($in_fh, 2, \$buffer);
  3052. my $chunk = '';
  3053. if ($block_type == 0) {
  3054. $VERBOSE && say STDERR "\n:: Extracting block of type 0";
  3055. $buffer = ''; # pad to a byte
  3056. $chunk = _extract_block_type_0($in_fh, \$buffer);
  3057. $search_window .= $chunk;
  3058. }
  3059. elsif ($block_type == 1) {
  3060. $VERBOSE && say STDERR "\n:: Extracting block of type 1";
  3061. $chunk = _extract_block_type_1($in_fh, \$buffer, \$search_window);
  3062. }
  3063. elsif ($block_type == 2) {
  3064. $VERBOSE && say STDERR "\n:: Extracting block of type 2";
  3065. $chunk = _extract_block_type_2($in_fh, \$buffer, \$search_window);
  3066. }
  3067. else {
  3068. confess "[!] Unknown block of type: $block_type";
  3069. }
  3070. print $out_fh $chunk;
  3071. $crc32 = crc32($chunk, $crc32);
  3072. $actual_length += length($chunk);
  3073. $search_window = substr($search_window, -$window_size) if (length($search_window) > 2 * $window_size);
  3074. last if $is_last;
  3075. }
  3076. $buffer = ''; # discard any padding bits
  3077. my $stored_crc32 = bits2int_lsb($in_fh, 32, \$buffer);
  3078. my $actual_crc32 = $crc32;
  3079. if ($stored_crc32 != $actual_crc32) {
  3080. confess "[!] The CRC32 does not match: $actual_crc32 (actual) != $stored_crc32 (stored)";
  3081. }
  3082. else {
  3083. $VERBOSE && print STDERR ":: CRC32 value: $actual_crc32\n";
  3084. }
  3085. my $stored_length = bits2int_lsb($in_fh, 32, \$buffer);
  3086. if ($stored_length != $actual_length) {
  3087. confess "[!] The length does not match: $actual_length (actual) != $stored_length (stored)";
  3088. }
  3089. else {
  3090. $VERBOSE && print STDERR ":: Total length: $actual_length\n";
  3091. }
  3092. if (eof($in_fh)) {
  3093. $VERBOSE && print STDERR "\n:: Reached the end of the file.\n";
  3094. }
  3095. else {
  3096. $VERBOSE && print STDERR "\n:: There is something else in the container! Trying to recurse!\n\n";
  3097. return ($decompressed . __SUB__->($in_fh));
  3098. }
  3099. return $decompressed;
  3100. }
  3101. ###############################
  3102. # LZ4 compressor
  3103. ###############################
  3104. sub lz4_compress($fh, $lzss_encoding_sub = \&lzss_encode) {
  3105. if (ref($fh) eq '') {
  3106. open(my $fh2, '<:raw', \$fh) or confess "error: $!";
  3107. return __SUB__->($fh2);
  3108. }
  3109. my $compressed = '';
  3110. $compressed .= int2bytes_lsb(0x184D2204, 4); # LZ4 magic number
  3111. my $fd = ''; # frame description
  3112. $fd .= chr(0b01_10_00_00); # flags (FLG)
  3113. $fd .= chr(0b0_111_0000); # block description (BD)
  3114. $compressed .= $fd;
  3115. $compressed .= chr(115); # header checksum
  3116. state $CHUNK_SIZE = 1 << 17;
  3117. while (read($fh, (my $chunk), $CHUNK_SIZE)) {
  3118. my ($literals, $distances, $lengths) = do {
  3119. local $LZ_MIN_LEN = 4 if ($LZ_MIN_LEN < 4);
  3120. local $LZ_MAX_LEN = ~0;
  3121. local $LZ_MAX_DIST = (1 << 16) - 1;
  3122. $lzss_encoding_sub->(substr($chunk, 0, -5));
  3123. };
  3124. # The last 5 bytes of each block must be literals
  3125. # https://github.com/lz4/lz4/issues/1495
  3126. push @$literals, unpack('C*', substr($chunk, -5));
  3127. my $literals_end = $#{$literals};
  3128. my $block = '';
  3129. for (my $i = 0 ; $i <= $literals_end ; ++$i) {
  3130. my @uncompressed;
  3131. while ($i <= $literals_end and defined($literals->[$i])) {
  3132. push @uncompressed, $literals->[$i];
  3133. ++$i;
  3134. }
  3135. my $literals_string = pack('C*', @uncompressed);
  3136. my $literals_length = scalar(@uncompressed);
  3137. my $match_len = $lengths->[$i] ? ($lengths->[$i] - 4) : 0;
  3138. $block .= chr((($literals_length >= 15 ? 15 : $literals_length) << 4) | ($match_len >= 15 ? 15 : $match_len));
  3139. $literals_length -= 15;
  3140. $match_len -= 15;
  3141. while ($literals_length >= 0) {
  3142. $block .= ($literals_length >= 255 ? "\xff" : chr($literals_length));
  3143. $literals_length -= 255;
  3144. }
  3145. $block .= $literals_string;
  3146. my $dist = $distances->[$i] // last;
  3147. $block .= pack('b*', scalar reverse sprintf('%016b', $dist));
  3148. while ($match_len >= 0) {
  3149. $block .= ($match_len >= 255 ? "\xff" : chr($match_len));
  3150. $match_len -= 255;
  3151. }
  3152. }
  3153. if ($block ne '') {
  3154. $compressed .= int2bytes_lsb(length($block), 4);
  3155. $compressed .= $block;
  3156. }
  3157. }
  3158. $compressed .= int2bytes_lsb(0x00000000, 4); # EndMark
  3159. return $compressed;
  3160. }
  3161. ###############################
  3162. # LZ4 decompressor
  3163. ###############################
  3164. sub lz4_decompress($fh) {
  3165. if (ref($fh) eq '') {
  3166. open(my $fh2, '<:raw', \$fh) or confess "error: $!";
  3167. return __SUB__->($fh2);
  3168. }
  3169. my $decompressed = '';
  3170. while (!eof($fh)) {
  3171. bytes2int_lsb($fh, 4) == 0x184D2204 or confess "Incorrect LZ4 Frame magic number";
  3172. my $FLG = ord(getc($fh));
  3173. my $BD = ord(getc($fh));
  3174. my $version = $FLG & 0b11_00_00_00;
  3175. my $B_indep = $FLG & 0b00_10_00_00;
  3176. my $B_checksum = $FLG & 0b00_01_00_00;
  3177. my $C_size = $FLG & 0b00_00_10_00;
  3178. my $C_checksum = $FLG & 0b00_00_01_00;
  3179. my $DictID = $FLG & 0b00_00_00_01;
  3180. my $Block_MaxSize = $BD & 0b0_111_0000;
  3181. $VERBOSE && say STDERR "Maximum block size: $Block_MaxSize";
  3182. if ($version != 0b01_00_00_00) {
  3183. confess "Error: Invalid version number";
  3184. }
  3185. if ($C_size) {
  3186. my $content_size = bytes2int_lsb($fh, 8);
  3187. $VERBOSE && say STDERR "Content size: ", $content_size;
  3188. }
  3189. if ($DictID) {
  3190. my $dict_id = bytes2int_lsb($fh, 4);
  3191. $VERBOSE && say STDERR "Dictionary ID: ", $dict_id;
  3192. }
  3193. my $header_checksum = ord(getc($fh));
  3194. # TODO: compute and verify the header checksum
  3195. $VERBOSE && say STDERR "Header checksum: ", $header_checksum;
  3196. my $decoded = '';
  3197. while (!eof($fh)) {
  3198. my $block_size = bytes2int_lsb($fh, 4);
  3199. if ($block_size == 0x00000000) { # signifies an EndMark
  3200. $VERBOSE && say STDERR "Block size == 0";
  3201. last;
  3202. }
  3203. $VERBOSE && say STDERR "Block size: $block_size";
  3204. if ($block_size >> 31) {
  3205. $VERBOSE && say STDERR "Highest bit set: ", $block_size;
  3206. $block_size &= ((1 << 31) - 1);
  3207. $VERBOSE && say STDERR "Block size: ", $block_size;
  3208. my $uncompressed = '';
  3209. read($fh, $uncompressed, $block_size);
  3210. $decoded .= $uncompressed;
  3211. }
  3212. else {
  3213. my $compressed = '';
  3214. read($fh, $compressed, $block_size);
  3215. while ($compressed ne '') {
  3216. my $len_byte = ord(substr($compressed, 0, 1, ''));
  3217. my $literals_length = $len_byte >> 4;
  3218. my $match_len = $len_byte & 0b1111;
  3219. ## say STDERR "Literal: ", $literals_length;
  3220. ## say STDERR "Match len: ", $match_len;
  3221. if ($literals_length == 15) {
  3222. while (1) {
  3223. my $byte_len = ord(substr($compressed, 0, 1, ''));
  3224. $literals_length += $byte_len;
  3225. last if $byte_len != 255;
  3226. }
  3227. }
  3228. ## say STDERR "Total literals length: ", $literals_length;
  3229. my $literals = '';
  3230. if ($literals_length > 0) {
  3231. $literals = substr($compressed, 0, $literals_length, '');
  3232. }
  3233. if ($compressed eq '') { # end of block
  3234. $decoded .= $literals;
  3235. last;
  3236. }
  3237. my $offset = oct('0b' . reverse unpack('b16', substr($compressed, 0, 2, '')));
  3238. if ($offset == 0) {
  3239. confess "Corrupted block";
  3240. }
  3241. ## say STDERR "Offset: $offset";
  3242. if ($match_len == 15) {
  3243. while (1) {
  3244. my $byte_len = ord(substr($compressed, 0, 1, ''));
  3245. $match_len += $byte_len;
  3246. last if $byte_len != 255;
  3247. }
  3248. }
  3249. $decoded .= $literals;
  3250. $match_len += 4;
  3251. ## say STDERR "Total match len: $match_len\n";
  3252. if ($offset >= $match_len) { # non-overlapping matches
  3253. $decoded .= substr($decoded, length($decoded) - $offset, $match_len);
  3254. }
  3255. elsif ($offset == 1) {
  3256. $decoded .= substr($decoded, -1) x $match_len;
  3257. }
  3258. else { # overlapping matches
  3259. foreach my $i (1 .. $match_len) {
  3260. $decoded .= substr($decoded, length($decoded) - $offset, 1);
  3261. }
  3262. }
  3263. }
  3264. }
  3265. if ($B_checksum) {
  3266. my $content_checksum = bytes2int_lsb($fh, 4);
  3267. $VERBOSE && say STDERR "Block checksum: $content_checksum";
  3268. }
  3269. if ($B_indep) { # blocks are independent of each other
  3270. $decompressed .= $decoded;
  3271. $decoded = '';
  3272. }
  3273. elsif (length($decoded) > 2**16) { # blocks are dependent
  3274. $decompressed .= substr($decoded, 0, -(2**16), '');
  3275. }
  3276. }
  3277. # TODO: compute and verify checksum
  3278. if ($C_checksum) {
  3279. my $content_checksum = bytes2int_lsb($fh, 4);
  3280. $VERBOSE && say STDERR "Content checksum: $content_checksum";
  3281. }
  3282. $decompressed .= $decoded;
  3283. }
  3284. return $decompressed;
  3285. }
  3286. 1;
  3287. __END__
  3288. =encoding utf-8
  3289. =head1 NAME
  3290. Compression::Util - Implementation of various techniques used in data compression.
  3291. =head1 SYNOPSIS
  3292. use 5.036;
  3293. use Getopt::Std qw(getopts);
  3294. use Compression::Util qw(:all);
  3295. use constant {CHUNK_SIZE => 1 << 17};
  3296. local $Compression::Util::VERBOSE = 0;
  3297. getopts('d', \my %opts);
  3298. sub compress ($fh, $out_fh) {
  3299. while (read($fh, (my $chunk), CHUNK_SIZE)) {
  3300. print $out_fh bwt_compress($chunk);
  3301. }
  3302. }
  3303. sub decompress ($fh, $out_fh) {
  3304. while (!eof($fh)) {
  3305. print $out_fh bwt_decompress($fh);
  3306. }
  3307. }
  3308. $opts{d} ? decompress(\*STDIN, \*STDOUT) : compress(\*STDIN, \*STDOUT);
  3309. =head1 DESCRIPTION
  3310. B<Compression::Util> is a function-based module, implementing various techniques used in data compression, such as:
  3311. * Burrows-Wheeler transform
  3312. * Move-to-front transform
  3313. * Huffman Coding
  3314. * Arithmetic Coding (in fixed bits)
  3315. * Run-length encoding
  3316. * Fibonacci coding
  3317. * Elias gamma/omega coding
  3318. * Delta coding
  3319. * BWT-based (de)compression
  3320. * LZ77/LZSS (de)compression
  3321. * LZW (de)compression
  3322. * Bzip2 (de)compression
  3323. * Gzip (de)compression
  3324. * LZ4 (de)compression
  3325. The provided techniques can be easily combined in various ways to create powerful compressors, such as the Bzip2 compressor, which is a pipeline of the following methods:
  3326. 1. Run-length encoding (RLE4)
  3327. 2. Burrows-Wheeler transform (BWT)
  3328. 3. Move-to-front transform (MTF)
  3329. 4. Zero run-length encoding (ZRLE)
  3330. 5. Huffman coding
  3331. A simple BWT-based compression method (similar to Bzip2) is provided by the function C<bwt_compress()>, which can be explicitly implemented as:
  3332. use 5.036;
  3333. use Compression::Util qw(:all);
  3334. my $data = do { open my $fh, '<:raw', $^X; local $/; <$fh> };
  3335. my $rle4 = rle4_encode(string2symbols($data));
  3336. my ($bwt, $idx) = bwt_encode(symbols2string($rle4));
  3337. my ($mtf, $alphabet) = mtf_encode(string2symbols($bwt));
  3338. my $rle = zrle_encode($mtf);
  3339. my $enc = pack('N', $idx)
  3340. . encode_alphabet($alphabet)
  3341. . create_huffman_entry($rle);
  3342. say "Original size : ", length($data);
  3343. say "Compressed size: ", length($enc);
  3344. # Decompress the result
  3345. bwt_decompress($enc) eq $data or die "decompression error";
  3346. =head2 TERMINOLOGY
  3347. =head3 bit
  3348. A bit value is either C<1> or C<0>.
  3349. =head3 bitstring
  3350. A bitstring is a string containing only 1s and 0s.
  3351. =head3 byte
  3352. A byte value is an integer between C<0> and C<255>, inclusive.
  3353. =head3 string
  3354. A string means a binary (non-UTF*) string.
  3355. =head3 symbols
  3356. An array of symbols means an array of non-negative integer values.
  3357. =head3 filehandle
  3358. A filehandle is denoted by C<$fh>.
  3359. The encoding of file-handles must be set to C<:raw>.
  3360. =head1 PACKAGE VARIABLES
  3361. B<Compression::Util> provides the following package variables:
  3362. $Compression::Util::VERBOSE = 0; # true to enable verbose/debug mode
  3363. $Compression::Util::LZ_MIN_LEN = 4; # minimum match length in LZ parsing
  3364. $Compression::Util::LZ_MAX_LEN = 1 << 15; # maximum match length in LZ parsing
  3365. $Compression::Util::LZ_MAX_DIST = ~0; # maximum back-reference distance allowed
  3366. $Compression::Util::LZ_MAX_CHAIN_LEN = 32; # how many recent positions to remember for each match in LZ parsing
  3367. These package variables can also be imported as:
  3368. use Compression::Util qw(
  3369. $LZ_MIN_LEN
  3370. $LZ_MAX_LEN
  3371. $LZ_MAX_DIST
  3372. $LZ_MAX_CHAIN_LEN
  3373. );
  3374. =head2 $LZ_MIN_LEN
  3375. Minimum length of a match in LZ parsing. The value must be an integer greater than or equal to C<2>. Larger values will result in faster parsing, but lower compression ratio.
  3376. By default, C<$LZ_MIN_LEN> is set to C<4>.
  3377. B<NOTE:> for C<lzss_encode_fast()> is recommended to set C<$LZ_MIN_LEN = 5>, which will result in slightly better compression ratio.
  3378. =head2 $LZ_MAX_LEN
  3379. Maximum length of a match in LZ parsing. The value must be an integer greater than or equal to C<0>.
  3380. By default, C<$LZ_MAX_LEN> is set to C<32768>.
  3381. B<NOTE:> the functions C<lz77_encode()> and C<lzb_compress()> will ignore this value and will always use unlimited match lengths.
  3382. =head2 $LZ_MAX_DIST
  3383. Maximum back-reference distance allowed in LZ parsing. Smaller values will result in faster parsing, but lower compression ratio.
  3384. By default, the value is unlimited, meaning that arbitrarily large back-references will be generated.
  3385. B<NOTE:> the function C<lzb_compress()> will ignore this value and will always use the value C<2**16 - 1> as the maximum back-reference distance.
  3386. =head2 $LZ_MAX_CHAIN_LEN
  3387. The value of C<$LZ_MAX_CHAIN_LEN> controls the amount of recent positions to remember for each matched prefix. A larger value results in better compression, finding longer matches, at the expense of speed.
  3388. By default, C<$LZ_MAX_CHAIN_LEN> is set to C<32>.
  3389. B<NOTE:> the function C<lzss_encode_fast()> will ignore this value, always using a value of C<1>.
  3390. =head1 HIGH-LEVEL FUNCTIONS
  3391. create_huffman_entry(\@symbols) # Create a Huffman Coding block
  3392. decode_huffman_entry($fh) # Decode a Huffman Coding block
  3393. create_ac_entry(\@symbols) # Create an Arithmetic Coding block
  3394. decode_ac_entry($fh) # Decode an Arithmetic Coding block
  3395. create_adaptive_ac_entry(\@symbols) # Create an Adaptive Arithmetic Coding block
  3396. decode_adaptive_ac_entry($fh) # Decode an Adaptive Arithmetic Coding block
  3397. mrl_compress($string) # MRL compression (MTF+ZRLE+RLE4+Huffman coding)
  3398. mrl_decompress($fh) # Inverse of the above method
  3399. mrl_compress_symbolic(\@symbols) # Symbolic MRL compression (MTF+ZRLE+RLE4+Huffman coding)
  3400. mrl_decompress_symbolic($fh) # Inverse of the above method
  3401. bwt_compress($string) # BWT-based compression (RLE4+BWT+MTF+ZRLE+Huffman coding)
  3402. bwt_decompress($fh) # Inverse of the above method
  3403. bwt_compress_symbolic(\@symbols) # Symbolic BWT-based compression (RLE4+sBWT+MTF+ZRLE+Huffman coding)
  3404. bwt_decompress_symbolic($fh) # Inverse of the above method
  3405. bzip2_compress($string) # Compress a given string using the Bzip2 format
  3406. bzip2_decompress($fh) # Inverse of the above method
  3407. gzip_compress($string) # Compress a given string using the Gzip format
  3408. gzip_decompress($fh) # Inverse of the above method
  3409. lzss_compress($string) # LZSS + DEFLATE-like encoding of lengths and distances
  3410. lzss_decompress($fh) # Inverse of the above method
  3411. lzss_compress_symbolic(\@symbols) # Symbolic LZSS + DEFLATE-like encoding of lengths and distances
  3412. lzss_decompress_symbolic($fh) # Inverse of the above method
  3413. lz77_compress($string) # LZ77 + Huffman coding of lengths and literals + OBH for distances
  3414. lz77_decompress($fh) # Inverse of the above method
  3415. lz77_compress_symbolic(\@symbols) # Symbolic LZ77 + Huffman coding of lengths and literals + OBH for distances
  3416. lz77_decompress_symbolic($fh) # Inverse of the above method
  3417. lzb_compress($string) # LZSS compression, using a byte-aligned encoding method, similar to LZ4
  3418. lzb_decompress($fh) # Inverse of the above method
  3419. lzw_compress($string) # LZW + abc_encode() compression
  3420. lzw_decompress($fh) # Inverse of the above method
  3421. lz4_compress($string) # Compress a given string using the LZ4 frame format
  3422. lz4_decompress($fh) # Inverse of the above method
  3423. =head1 MEDIUM-LEVEL FUNCTIONS
  3424. deltas(\@ints) # Computes the differences between integers
  3425. accumulate(\@deltas) # Inverse of the above method
  3426. delta_encode(\@ints) # Delta+RLE encoding of an array-ref of integers
  3427. delta_decode($fh) # Inverse of the above method
  3428. fibonacci_encode(\@symbols) # Fibonacci coding of an array-ref of symbols
  3429. fibonacci_decode($fh) # Inverse of the above method
  3430. elias_gamma_encode(\@symbols) # Elias Gamma coding method of an array-ref of symbols
  3431. elias_gamma_decode($fh) # Inverse of the above method
  3432. elias_omega_encode(\@symbols) # Elias Omega coding method of an array-ref of symbols
  3433. elias_omega_decode($fh) # Inverse of the above method
  3434. abc_encode(\@symbols) # Adaptive Binary Concatenation method of an array-ref of symbols
  3435. abc_decode($fh) # Inverse of the above method
  3436. obh_encode(\@symbols) # Offset bits + Huffman coding of an array-ref of symbols
  3437. obh_decode($fh) # Inverse of the above method
  3438. bwt_encode($string) # Burrows-Wheeler transform
  3439. bwt_decode($bwt, $idx) # Inverse of Burrows-Wheeler transform
  3440. bwt_encode_symbolic(\@symbols) # Burrows-Wheeler transform over an array-ref of symbols
  3441. bwt_decode_symbolic(\@bwt, $idx) # Inverse of symbolic Burrows-Wheeler transform
  3442. mtf_encode(\@symbols) # Move-to-front transform
  3443. mtf_decode(\@mtf, \@alphabet) # Inverse of the above method
  3444. encode_alphabet(\@alphabet) # Encode an alphabet of symbols into a binary string
  3445. decode_alphabet($fh) # Inverse of the above method
  3446. encode_alphabet_256(\@alphabet) # Encode an alphabet of symbols (limited to [0..255]) into a binary string
  3447. decode_alphabet_256($fh) # Inverse of the above method
  3448. frequencies(\@symbols) # Returns a dictionary with symbol frequencies
  3449. run_length(\@symbols, $max=undef) # Run-length encoding, returning a 2D array-ref
  3450. rle4_encode(\@symbols, $max=255) # Run-length encoding with 4 or more consecutive characters
  3451. rle4_decode(\@rle4) # Inverse of the above method
  3452. zrle_encode(\@symbols) # Run-length encoding of zeros
  3453. zrle_decode(\@zrle) # Inverse of the above method
  3454. ac_encode(\@symbols) # Arithmetic Coding applied on an array-ref of symbols
  3455. ac_decode($bitstring, \%freq) # Inverse of the above method
  3456. adaptive_ac_encode(\@symbols) # Adaptive Arithmetic Coding applied on an array-ref of symbols
  3457. adaptive_ac_decode($bitstring, \@alphabet) # Inverse of the above method
  3458. lzw_encode($string) # LZW encoding of a given string
  3459. lzw_decode(\@symbols) # Inverse of the above method
  3460. =head1 LOW-LEVEL FUNCTIONS
  3461. crc32($string, $prev_crc = 0) # Compute the CRC32 value of a given string
  3462. read_bit($fh, \$buffer) # Read one bit from file-handle (MSB)
  3463. read_bit_lsb($fh, \$buffer) # Read one bit from file-handle (LSB)
  3464. read_bits($fh, $len) # Read `$len` bits from file-handle (MSB)
  3465. read_bits_lsb($fh, $len) # Read `$len` bits from file-handle (LSB)
  3466. int2bits($symbol, $size) # Convert an integer to bits of width `$size` (MSB)
  3467. int2bits_lsb($symbol, $size) # Convert an integer to bits of width `$size` (LSB)
  3468. bits2int($fh, $size, \$buffer) # Inverse of `int2bits()`
  3469. bits2int_lsb($fh, $size, \$buffer) # Inverse of `int2bits_lsb()`
  3470. bytes2int($fh, $n) # Read `$n` bytes from file-handle as an integer (MSB)
  3471. bytes2int_lsb($fh, $n) # Read `$n` bytes from file-handle as an integer (LSB)
  3472. int2bytes($symbol, $size) # Convert an integer into `$size` bytes. (MSB)
  3473. int2bytes_lsb($symbol, $size) # Convert an integer into `$size` bytes. (LSB)
  3474. string2symbols($string) # Returns an array-ref of code points
  3475. symbols2string(\@symbols) # Returns a string, given an array-ref of code points
  3476. read_null_terminated($fh) # Read a binary string that ends with NULL ("\0")
  3477. binary_vrl_encode($bitstring) # Binary variable run-length encoding
  3478. binary_vrl_decode($bitstring) # Binary variable run-length decoding
  3479. bwt_sort($string) # Burrows-Wheeler sorting
  3480. bwt_sort_symbolic(\@symbols) # Burrows-Wheeler sorting, applied on an array-ref of symbols
  3481. huffman_encode(\@symbols, \%dict) # Huffman encoding
  3482. huffman_decode($bitstring, \%dict) # Huffman decoding, given a string of bits
  3483. huffman_from_freq(\%freq) # Create Huffman dictionaries, given an hash-ref of frequencies
  3484. huffman_from_symbols(\@symbols) # Create Huffman dictionaries, given an array-ref of symbols
  3485. huffman_from_code_lengths(\@lens) # Create canonical Huffman codes, given an array-ref of code lengths
  3486. make_deflate_tables($max_dist, $max_len) # Returns the DEFLATE tables for distance and length symbols
  3487. find_deflate_index($value, \@table) # Returns the index in a DEFLATE table, given a numerical value
  3488. lzss_encode($string) # LZSS encoding into literals, distances and lengths
  3489. lzss_encode_symbolic(\@symbols) # LZSS encoding into literals, distances and lengths (symbolic)
  3490. lzss_encode_fast($string) # Fast-LZSS encoding into literals, distances and lengths
  3491. lzss_encode_fast_symbolic(\@symbols) # Fast-LZSS encoding into literals, distances and lengths (symbolic)
  3492. lzss_decode(\@lits, \@dist, \@lens) # Inverse of lzss_encode() and lzss_encode_fast()
  3493. lzss_decode_symbolic(\@lits, \@dist, \@lens) # Inverse of lzss_encode_symbolic() and lzss_encode_fast_symbolic()
  3494. lz77_encode($string) # LZ77 encoding into literals, distances, lengths and matches
  3495. lz77_encode_symbolic(\@symbols) # LZ77 encoding into literals, distances, lengths and matches (symbolic)
  3496. lz77_decode(\@lits, \@dist, \@lens, \@matches) # Inverse of lz77_encode()
  3497. lz77_decode_symbolic(\@lits, \@dist, \@lens, \@matches) # Inverse of lz77_encode_symbolic()
  3498. deflate_encode(\@lits, \@dist, \@lens) # DEFLATE-like encoding of values returned by lzss_encode()
  3499. deflate_decode($fh) # Inverse of the above method
  3500. =head1 INTERFACE FOR HIGH-LEVEL FUNCTIONS
  3501. =head2 create_huffman_entry
  3502. my $string = create_huffman_entry(\@symbols);
  3503. High-level function that generates a Huffman coding block, given an array-ref of symbols.
  3504. =head2 decode_huffman_entry
  3505. my $symbols = decode_huffman_entry($fh);
  3506. my $symbols = decode_huffman_entry($string);
  3507. Inverse of C<create_huffman_entry()>.
  3508. =head2 create_ac_entry
  3509. my $string = create_ac_entry(\@symbols);
  3510. High-level function that generates an Arithmetic Coding block, given an array-ref of symbols.
  3511. =head2 decode_ac_entry
  3512. my $symbols = decode_ac_entry($fh);
  3513. my $symbols = decode_ac_entry($string);
  3514. Inverse of C<create_ac_entry()>.
  3515. =head2 create_adaptive_ac_entry
  3516. my $string = create_adaptive_ac_entry(\@symbols);
  3517. High-level function that generates an Adaptive Arithmetic Coding block, given an array-ref of symbols.
  3518. =head2 decode_adaptive_ac_entry
  3519. my $symbols = decode_adaptive_ac_entry($fh);
  3520. my $symbols = decode_adaptive_ac_entry($string);
  3521. Inverse of C<create_adaptive_ac_entry()>.
  3522. =head2 lz77_compress / lz77_compress_symbolic
  3523. # With Huffman coding
  3524. my $string = lz77_compress($data);
  3525. my $string = lz77_compress(\@symbols);
  3526. # With Arithmetic Coding
  3527. my $string = lz77_compress($data, \&create_ac_entry);
  3528. # Using Fast-LZSS parsing + Huffman coding
  3529. my $string = lz77_compress($data, \&create_huffman_entry, \&lzss_encode_fast);
  3530. High-level function that performs LZ77 compression on the provided data, using the pipeline:
  3531. 1. lz77_encode
  3532. 2. create_huffman_entry(literals)
  3533. 3. create_huffman_entry(lengths)
  3534. 4. create_huffman_entry(matches)
  3535. 5. obh_encode(distances)
  3536. The function accepts either a string or an array-ref of symbols as the first argument.
  3537. =head2 lz77_decompress / lz77_decompress_symbolic
  3538. # With Huffman coding
  3539. my $data = lz77_decompress($fh);
  3540. my $data = lz77_decompress($string);
  3541. # With Arithemtic coding
  3542. my $data = lz77_decompress($fh, \&decode_ac_entry);
  3543. my $data = lz77_decompress($string, \&decode_ac_entry);
  3544. # Symbolic, with Huffman coding
  3545. my $symbols = lz77_decompress_symbolic($fh);
  3546. my $symbols = lz77_decompress_symbolic($string);
  3547. Inverse of C<lz77_compress()> and C<lz77_compress_symbolic()>, respectively.
  3548. =head2 lzss_compress / lzss_compress_symbolic
  3549. # With Huffman coding
  3550. my $string = lzss_compress($data);
  3551. my $string = lzss_compress(\@symbols);
  3552. # With Arithmetic Coding
  3553. my $string = lzss_compress($data, \&create_ac_entry);
  3554. # Using Fast-LZSS parsing + Huffman coding
  3555. my $string = lzss_compress($data, \&create_huffman_entry, \&lzss_encode_fast);
  3556. High-level function that performs LZSS (Lempel-Ziv-Storer-Szymanski) compression on the provided data, using the pipeline:
  3557. 1. lzss_encode
  3558. 2. deflate_encode
  3559. The function accepts either a string or an array-ref of symbols as the first argument.
  3560. =head2 lzss_decompress / lzss_decompress_symbolic
  3561. # With Huffman coding
  3562. my $data = lzss_decompress($fh);
  3563. my $data = lzss_decompress($string);
  3564. # With Arithmetic coding
  3565. my $data = lzss_decompress($fh, \&decode_ac_entry);
  3566. my $data = lzss_decompress($string, \&decode_ac_entry);
  3567. # Symbolic, with Huffman coding
  3568. my $symbols = lzss_decompress_symbolic($fh);
  3569. my $symbols = lzss_decompress_symbolic($string);
  3570. Inverse of C<lzss_compress()> and C<lzss_compress_symbolic()>, respectively.
  3571. =head2 lzb_compress
  3572. my $string = lzb_compress($data);
  3573. my $string = lzb_compress($data, \&lzss_encode_fast); # with fast-LZ parsing
  3574. High-level function that performs byte-oriented LZSS compression, inspired by LZ4.
  3575. =head2 lzb_decompress
  3576. my $data = lzb_decompress($fh);
  3577. my $data = lzb_decompress($string);
  3578. Inverse of C<lzb_compress()>.
  3579. =head2 lz4_compress
  3580. my $string = lz4_compress($fh);
  3581. my $string = lz4_compress($data);
  3582. my $string = lz4_compress($data, \&lzss_encode_fast); # with fast-LZ parsing
  3583. Valid LZ4 compressor, using the LZ4 Frame format, given either a string or an input file-handle.
  3584. The input data is split into chunks of length C<2**17> and compressed into independent LZ4 blocks.
  3585. =head2 lz4_decompress
  3586. my $data = lz4_decompress($fh);
  3587. my $data = lz4_decompress($string);
  3588. Decompress LZ4 Frame data, given either a string or an input file-handle. Concatenated LZ4 Frames are also supported.
  3589. =head2 lzw_compress
  3590. my $string = lzw_compress($data);
  3591. High-level function that performs LZW (Lempel-Ziv-Welch) compression on the provided data, using the pipeline:
  3592. 1. lzw_encode
  3593. 2. abc_encode
  3594. =head2 lzw_decompress
  3595. my $data = lzw_decompress($fh);
  3596. my $data = lzw_decompress($string);
  3597. Performs Lempel-Ziv-Welch (LZW) decompression on the provided string or file-handle. Inverse of C<lzw_compress()>.
  3598. =head2 bwt_compress
  3599. # Using Huffman Coding
  3600. my $string = bwt_compress($data);
  3601. # Using Arithmetic Coding
  3602. my $string = bwt_compress($data, \&create_ac_entry);
  3603. High-level function that performs BWT-based compression on the provided data, using the pipeline:
  3604. 1. rle4_encode
  3605. 2. bwt_encode
  3606. 3. mtf_encode
  3607. 4. zrle_encode
  3608. 5. create_huffman_entry
  3609. =head2 bwt_decompress
  3610. # With Huffman coding
  3611. my $data = bwt_decompress($fh);
  3612. my $data = bwt_decompress($string);
  3613. # With Arithmetic coding
  3614. my $data = bwt_decompress($fh, \&decode_ac_entry);
  3615. my $data = bwt_decompress($string, \&decode_ac_entry);
  3616. Inverse of C<bwt_compress()>.
  3617. =head2 bwt_compress_symbolic
  3618. # Does Huffman coding
  3619. my $string = bwt_compress_symbolic(\@symbols);
  3620. # Does Arithmetic coding
  3621. my $string = bwt_compress_symbolic(\@symbols, \&create_ac_entry);
  3622. Similar to C<bwt_compress()>, except that it accepts an arbitrary array-ref of non-negative integer values as input. It is also a bit slower on large inputs.
  3623. =head2 bwt_decompress_symbolic
  3624. # Using Huffman coding
  3625. my $symbols = bwt_decompress_symbolic($fh);
  3626. my $symbols = bwt_decompress_symbolic($string);
  3627. # Using Arithmetic coding
  3628. my $symbols = bwt_decompress_symbolic($fh, \&decode_ac_entry);
  3629. my $symbols = bwt_decompress_symbolic($string, \&decode_ac_entry);
  3630. Inverse of C<bwt_compress_symbolic()>.
  3631. =head2 bzip2_compress
  3632. my $string = bzip2_compress($data);
  3633. my $string = bzip2_compress($fh);
  3634. Valid Bzip2 compressor, given a string or an input file-handle.
  3635. =head2 bzip2_decompress
  3636. my $data = bzip2_decompress($string);
  3637. my $data = bzip2_decompress($fh);
  3638. Valid Bzip2 decompressor, given a string or an input file-handle.
  3639. =head2 gzip_compress
  3640. my $string = gzip_compress($fh);
  3641. my $string = gzip_compress($data);
  3642. my $string = gzip_compress($data, \&lzss_encode_fast); # using fast LZ-parsing
  3643. Valid Gzip compressor, given a string or an input file-handle.
  3644. =head2 gzip_decompress
  3645. my $data = gzip_decompress($string);
  3646. my $data = gzip_decompress($fh);
  3647. Valid Bzip2 decompressor, given a string or an input file-handle.
  3648. =head2 mrl_compress / mrl_compress_symbolic
  3649. # Does Huffman coding
  3650. my $enc = mrl_compress($str);
  3651. my $enc = mrl_compress(\@symbols);
  3652. # Does Arithmetic coding
  3653. my $enc = mrl_compress($str, \&create_ac_entry);
  3654. my $enc = mrl_compress(\@symbols, \&create_ac_entry);
  3655. A fast compression method, using the following pipeline:
  3656. 1. mtf_encode
  3657. 2. zrle_encode
  3658. 3. rle4_encode
  3659. 4. create_huffman_entry
  3660. It accepts an arbitrary array-ref of non-negative integer values as input.
  3661. =head2 mrl_decompress / mrl_decompress_symbolic
  3662. # With Huffman coding
  3663. my $data = mrl_decompress($fh);
  3664. my $data = mrl_decompress($string);
  3665. # Symbolic, with Huffman coding
  3666. my $symbols = mrl_decompress_symbolic($fh);
  3667. my $symbols = mrl_decompress_symbolic($string);
  3668. # Symbolic, with Arithmetic coding
  3669. my $symbols = mrl_decompress_symbolic($fh, \&decode_ac_entry);
  3670. my $symbols = mrl_decompress_symbolic($string, \&decode_ac_entry);
  3671. Inverse of C<mrl_decompress()> and C<mrl_compress_symbolic()>.
  3672. =head1 INTERFACE FOR MEDIUM-LEVEL FUNCTIONS
  3673. =head2 frequencies
  3674. my $freq = frequencies(\@symbols);
  3675. Returns an hash ref dictionary with frequencies, given an array-ref of symbols.
  3676. =head2 deltas
  3677. my $deltas = deltas(\@integers);
  3678. Computes the differences between consecutive integers, returning an array.
  3679. =head2 accumulate
  3680. my $integers = accumulate(\@deltas);
  3681. Inverse of C<deltas()>.
  3682. =head2 delta_encode
  3683. my $string = delta_encode(\@integers);
  3684. Encodes a sequence of integers (including negative integers) using Delta + Run-length + Elias omega coding, returning a binary string.
  3685. Delta encoding calculates the difference between consecutive integers in the sequence and encodes these differences using Elias omega coding. When it's beneficial, runs of identical symbols are collapsed with RLE.
  3686. This method supports both positive and negative integers.
  3687. =head2 delta_decode
  3688. # Given a file-handle
  3689. my $integers = delta_decode($fh);
  3690. # Given a string
  3691. my $integers = delta_decode($string);
  3692. Inverse of C<delta_encode()>.
  3693. =head2 fibonacci_encode
  3694. my $string = fibonacci_encode(\@symbols);
  3695. Encodes a sequence of non-negative integers using Fibonacci coding, returning a binary string.
  3696. =head2 fibonacci_decode
  3697. # Given a file-handle
  3698. my $symbols = fibonacci_decode($fh);
  3699. # Given a binary string
  3700. my $symbols = fibonacci_decode($string);
  3701. Inverse of C<fibonacci_encode()>.
  3702. =head2 elias_gamma_encode
  3703. my $string = elias_gamma_encode(\@symbols);
  3704. Encodes a sequence of non-negative integers using Elias Gamma coding, returning a binary string.
  3705. =head2 elias_gamma_decode
  3706. # Given a file-handle
  3707. my $symbols = elias_gamma_decode($fh);
  3708. # Given a binary string
  3709. my $symbols = elias_gamma_decode($string);
  3710. Inverse of C<elias_gamma_encode()>.
  3711. =head2 elias_omega_encode
  3712. my $string = elias_omega_encode(\@symbols);
  3713. Encodes a sequence of non-negative integers using Elias Omega coding, returning a binary string.
  3714. =head2 elias_omega_decode
  3715. # Given a file-handle
  3716. my $symbols = elias_omega_decode($fh);
  3717. # Given a binary string
  3718. my $symbols = elias_omega_decode($string);
  3719. Inverse of C<elias_omega_encode()>.
  3720. =head2 abc_encode
  3721. my $string = abc_encode(\@symbols);
  3722. Encodes a sequence of non-negative integers using the Adaptive Binary Concatenation encoding method.
  3723. This method is particularly effective in encoding a sequence of integers that are in ascending order or have roughly the same size in binary.
  3724. =head2 abc_decode
  3725. # Given a file-handle
  3726. my $symbols = abc_decode($fh);
  3727. # Given a binary string
  3728. my $symbols = abc_decode($string);
  3729. Inverse of C<abc_encode()>.
  3730. =head2 obh_encode
  3731. # With Huffman Coding
  3732. my $string = obh_encode(\@symbols);
  3733. # With Arithmetic Coding
  3734. my $string = obh_encode(\@symbols, \&create_ac_entry);
  3735. Encodes a sequence of non-negative integers using offset bits and Huffman coding.
  3736. This method is particularly effective in encoding a sequence of moderately large random integers, such as the list of distances returned by C<lzss_encode()>.
  3737. =head2 obh_decode
  3738. # Given a file-handle
  3739. my $symbols = obh_decode($fh); # Huffman decoding
  3740. my $symbols = obh_decode($fh, \&decode_ac_entry); # Arithmetic decoding
  3741. # Given a binary string
  3742. my $symbols = obh_decode($string); # Huffman decoding
  3743. my $symbols = obh_decode($string, \&decode_ac_entry); # Arithmetic decoding
  3744. Inverse of C<obh_encode()>.
  3745. =head2 bwt_encode
  3746. my ($bwt, $idx) = bwt_encode($string);
  3747. my ($bwt, $idx) = bwt_encode($string, $lookahead_len);
  3748. Applies the Burrows-Wheeler Transform (BWT) to a given string.
  3749. =head2 bwt_decode
  3750. my $string = bwt_decode($bwt, $idx);
  3751. Reverses the Burrows-Wheeler Transform (BWT) applied to a string.
  3752. The function returns the original string.
  3753. =head2 bwt_encode_symbolic
  3754. my ($bwt_symbols, $idx) = bwt_encode_symbolic(\@symbols);
  3755. Applies the Burrows-Wheeler Transform (BWT) to a sequence of symbolic elements.
  3756. =head2 bwt_decode_symbolic
  3757. my $symbols = bwt_decode_symbolic(\@bwt_symbols, $idx);
  3758. Reverses the Burrows-Wheeler Transform (BWT) applied to a sequence of symbolic elements.
  3759. =head2 mtf_encode
  3760. my $mtf = mtf_encode(\@symbols, \@alphabet);
  3761. my ($mtf, $alphabet) = mtf_encode(\@symbols);
  3762. Performs Move-To-Front (MTF) encoding on a sequence of symbols.
  3763. The function returns the encoded MTF sequence and the sorted list of unique symbols in the input data, representing the alphabet.
  3764. Optionally, the alphabet can be provided as a second argument. When two arguments are provided, only the MTF sequence is returned.
  3765. =head2 mtf_decode
  3766. my $symbols = mtf_decode(\@mtf, \@alphabet);
  3767. Inverse of C<mtf_encode()>.
  3768. =head2 encode_alphabet / encode_alphabet_256
  3769. my $string = encode_alphabet(\@alphabet); # supports arbitrarily large symbols
  3770. my $string = encode_alphabet_256(\@alphabet); # limited to symbols [0..255]
  3771. Encode a sorted alphabet of symbols into a binary string.
  3772. =head2 decode_alphabet / decode_alphabet_256
  3773. my $alphabet = decode_alphabet($fh);
  3774. my $alphabet = decode_alphabet($string);
  3775. my $alphabet = decode_alphabet_256($fh);
  3776. my $alphabet = decode_alphabet_256($string);
  3777. Decodes an encoded alphabet, given a file-handle or a binary string, returning an array-ref of symbols. Inverse of C<encode_alphabet()>.
  3778. =head2 run_length
  3779. my $rl = run_length(\@symbols);
  3780. my $rl = run_length(\@symbols, $max_run);
  3781. Performs Run-Length Encoding (RLE) on a sequence of symbolic elements.
  3782. It takes two parameters: C<\@symbols>, representing an array of symbols, and C<$max_run>, indicating the maximum run length allowed.
  3783. The function returns a 2D-array, with pairs: C<[symbol, run_length]>, such that the following code reconstructs the C<\@symbols> array:
  3784. my @symbols = map { ($_->[0]) x $_->[1] } @$rl;
  3785. By default, the maximum run-length is unlimited.
  3786. =head2 rle4_encode
  3787. my $rle4 = rle4_encode($string);
  3788. my $rle4 = rle4_encode(\@symbols);
  3789. my $rle4 = rle4_encode(\@symbols, $max_run);
  3790. Performs Run-Length Encoding (RLE) on a sequence of symbolic elements, specifically designed for runs of four or more consecutive symbols.
  3791. It takes two parameters: C<\@symbols>, representing an array of symbols, and C<$max_run>, indicating the maximum run length allowed during encoding.
  3792. The function returns the encoded RLE sequence as an array-ref of symbols.
  3793. By default, the maximum run-length is limited to C<255>.
  3794. =head2 rle4_decode
  3795. my $symbols = rle4_decode(\@rle4);
  3796. my $symbols = rle4_decode($rle4_string);
  3797. Inverse of C<rle4_encode()>.
  3798. =head2 zrle_encode
  3799. my $zrle = zrle_encode(\@symbols);
  3800. Performs Zero-Run-Length Encoding (ZRLE) on a sequence of symbolic elements, returning the encoded ZRLE sequence as an array-ref of symbols.
  3801. This function efficiently encodes runs of zeros, but also increments each symbol by C<1>.
  3802. =head2 zrle_decode
  3803. my $symbols = zrle_decode($zrle);
  3804. Inverse of C<zrle_encode()>.
  3805. =head2 ac_encode
  3806. my ($bitstring, $freq) = ac_encode(\@symbols);
  3807. Performs Arithmetic Coding on the provided symbols.
  3808. It takes a single parameter, C<\@symbols>, representing the symbols to be encoded.
  3809. The function returns two values: C<$bitstring>, which is a string of 1s and 0s, and C<$freq>, representing the frequency table used for encoding.
  3810. =head2 ac_decode
  3811. my $symbols = ac_decode($bits_fh, \%freq);
  3812. my $symbols = ac_decode($bitstring, \%freq);
  3813. Performs Arithmetic Coding decoding using the provided frequency table and a string of 1s and 0s. Inverse of C<ac_encode()>.
  3814. It takes two parameters: C<$bitstring>, representing a string of 1s and 0s containing the arithmetic coded data, and C<\%freq>, representing the frequency table used for encoding.
  3815. The function returns the decoded sequence of symbols.
  3816. =head2 adaptive_ac_encode
  3817. my ($bitstring, $alphabet) = adaptive_ac_encode(\@symbols);
  3818. Performs Adaptive Arithmetic Coding on the provided symbols.
  3819. It takes a single parameter, C<\@symbols>, representing the symbols to be encoded.
  3820. The function returns two values: C<$bitstring>, which is a string of 1s and 0s, and C<$alphabet>, which is an array-ref of distinct sorted symbols.
  3821. =head2 adaptive_ac_decode
  3822. my $symbols = adaptive_ac_decode($bits_fh, \@alphabet);
  3823. my $symbols = adaptive_ac_decode($bitstring, \@alphabet);
  3824. Performs Adaptive Arithmetic Coding decoding using the provided frequency table and a string of 1s and 0s.
  3825. It takes two parameters: C<$bitstring>, representing a string of 1s and 0s containing the adaptive arithmetic coded data, and C<\@alphabet>, representing the array of distinct sorted symbols that appear in the encoded data.
  3826. The function returns the decoded sequence of symbols.
  3827. =head2 lzw_encode
  3828. my $symbols = lzw_encode($string);
  3829. Performs Lempel-Ziv-Welch (LZW) encoding on the provided string.
  3830. It takes a single parameter, C<$string>, representing the data to be encoded.
  3831. The function returns an array-ref of symbols.
  3832. =head2 lzw_decode
  3833. my $string = lzw_decode(\@symbols);
  3834. Performs Lempel-Ziv-Welch (LZW) decoding on the provided symbols. Inverse of C<lzw_encode()>.
  3835. The function returns the decoded string.
  3836. =head1 INTERFACE FOR LOW-LEVEL FUNCTIONS
  3837. =head2 crc32
  3838. my $int32 = crc32($data);
  3839. my $int32 = crc32($data, $prev_crc32);
  3840. Compute the CRC32 of a given string.
  3841. =head2 read_bit
  3842. my $bit = read_bit($fh, \$buffer);
  3843. Reads a single bit from a file-handle C<$fh> (MSB order).
  3844. The function stores the extra bits inside the C<$buffer>, reading one character at a time from the file-handle.
  3845. =head2 read_bit_lsb
  3846. my $bit = read_bit_lsb($fh, \$buffer);
  3847. Reads a single bit from a file-handle C<$fh> (LSB order).
  3848. The function stores the extra bits inside the C<$buffer>, reading one character at a time from the file-handle.
  3849. =head2 read_bits
  3850. my $bitstring = read_bits($fh, $bits_len);
  3851. Reads a specified number of bits (C<$bits_len>) from a file-handle (C<$fh>) and returns them as a string, in MSB order.
  3852. =head2 read_bits_lsb
  3853. my $bitstring = read_bits_lsb($fh, $bits_len);
  3854. Reads a specified number of bits (C<$bits_len>) from a file-handle (C<$fh>) and returns them as a string, in LSB order.
  3855. =head2 int2bits
  3856. my $bitstring = int2bits($symbol, $size)
  3857. Convert a non-negative integer to a bitstring of width C<$size>, in MSB order.
  3858. =head2 int2bits_lsb
  3859. my $bitstring = int2bits_lsb($symbol, $size)
  3860. Convert a non-negative integer to a bitstring of width C<$size>, in LSB order.
  3861. =head2 int2bytes
  3862. my $string = int2bytes($symbol, $size);
  3863. Convert a non-negative integer to a byte-string of width C<$size>, in MSB order.
  3864. =head2 int2bytes_lsb
  3865. my $string = int2bytes_lsb($symbol, $size);
  3866. Convert a non-negative integer to a byte-string of width C<$size>, in LSB order.
  3867. =head2 bits2int
  3868. my $integer = bits2int($fh, $size, \$buffer);
  3869. Read C<$size> bits from a file-handle C<$fh> and convert them to an integer, in MSB order. Inverse of C<int2bits()>.
  3870. The function stores the extra bits inside the C<$buffer>, reading one character at a time from the file-handle.
  3871. =head2 bits2int_lsb
  3872. my $integer = bits2int_lsb($fh, $size, \$buffer);
  3873. Read C<$size> bits from a file-handle C<$fh> and convert them to an integer, in LSB order. Inverse of C<int2bits_lsb()>.
  3874. The function stores the extra bits inside the C<$buffer>, reading one character at a time from the file-handle.
  3875. =head2 bytes2int
  3876. my $integer = bytes2int($fh, $n);
  3877. my $integer = bytes2int($str, $n);
  3878. Read C<$n> bytes from a file-handle C<$fh> or from a string C<$str> and convert them to an integer, in MSB order.
  3879. =head2 bytes2int_lsb
  3880. my $integer = bytes2int_lsb($fh, $n);
  3881. my $integer = bytes2int_lsb($str, $n);
  3882. Read C<$n> bytes from a file-handle C<$fh> or from a string C<$str> and convert them to an integer, in LSB order.
  3883. =head2 string2symbols
  3884. my $symbols = string2symbols($string)
  3885. Returns an array-ref of code points, given a string.
  3886. =head2 symbols2string
  3887. my $string = symbols2string(\@symbols)
  3888. Returns a string, given an array-ref of code points.
  3889. =head2 read_null_terminated
  3890. my $string = read_null_terminated($fh)
  3891. Read a string from file-handle C<$fh> that ends with a NULL character ("\0").
  3892. =head2 binary_vrl_encode
  3893. my $bitstring_enc = binary_vrl_encode($bitstring);
  3894. Given a string of 1s and 0s, returns back a bitstring of 1s and 0s encoded using variable run-length encoding.
  3895. =head2 binary_vrl_decode
  3896. my $bitstring = binary_vrl_decode($bitstring_enc);
  3897. Given an encoded bitstring, returned by C<binary_vrl_encode()>, gives back the decoded string of 1s and 0s.
  3898. =head2 bwt_sort
  3899. my $indices = bwt_sort($string);
  3900. my $indices = bwt_sort($string, $lookahead_len);
  3901. Low-level function that sorts the rotations of a given string using the Burrows-Wheeler Transform (BWT) algorithm.
  3902. It takes two parameters: C<$string>, which is the input string to be transformed, and C<$LOOKAHEAD_LEN> (optional), representing the length of look-ahead during sorting.
  3903. The function returns an array-ref of indices.
  3904. There is probably no need to call this function explicitly. Use C<bwt_encode()> instead!
  3905. =head2 bwt_sort_symbolic
  3906. my $indices = bwt_sort_symbolic(\@symbols);
  3907. Low-level function that sorts the rotations of a sequence of symbolic elements using the Burrows-Wheeler Transform (BWT) algorithm.
  3908. It takes a single parameter C<\@symbols>, which represents the input sequence of symbolic elements. The function returns an array of indices.
  3909. There is probably no need to call this function explicitly. Use C<bwt_encode_symbolic()> instead!
  3910. =head2 huffman_from_freq
  3911. my $dict = huffman_from_freq(\%freq);
  3912. my ($dict, $rev_dict) = huffman_from_freq(\%freq);
  3913. Low-level function that constructs Huffman prefix codes, based on the frequency of symbols provided in a hash table.
  3914. It takes a single parameter, C<\%freq>, representing the hash table where keys are symbols, and values are their corresponding frequencies.
  3915. The function returns two values: C<$dict>, which is the mapping of symbols to Huffman codes, and C<$rev_dict>, which holds the reverse mapping of Huffman codes to symbols.
  3916. The prefix codes are in canonical form, as defined in RFC 1951 (Section 3.2.2).
  3917. =head2 huffman_from_symbols
  3918. my $dict = huffman_from_symbols(\@symbols);
  3919. my ($dict, $rev_dict) = huffman_from_symbols(\@symbols);
  3920. Low-level function that constructs Huffman prefix codes, given an array-ref of symbols.
  3921. It takes a single parameter, C<\@symbols>, from which it computes the frequency of each symbol and generates the corresponding Huffman prefix codes.
  3922. The function returns two values: C<$dict>, which is the mapping of symbols to Huffman codes, and C<$rev_dict>, which holds the reverse mapping of Huffman codes to symbols.
  3923. The prefix codes are in canonical form, as defined in RFC 1951 (Section 3.2.2).
  3924. =head2 huffman_from_code_lengths
  3925. my $dict = huffman_from_code_lengths(\@code_lengths);
  3926. my ($dict, $rev_dict) = huffman_from_code_lengths(\@code_lengths);
  3927. Low-level function that constructs a dictionary of canonical prefix codes, given an array of code lengths, as defined in RFC 1951 (Section 3.2.2).
  3928. It takes a single parameter, C<\@code_lengths>, where entry C<$i> in the array corresponds to the code length for symbol C<$i>.
  3929. The function returns two values: C<$dict>, which is the mapping of symbols to Huffman codes, and C<$rev_dict>, which holds the reverse mapping of Huffman codes to symbols.
  3930. =head2 huffman_encode
  3931. my $bitstring = huffman_encode(\@symbols, $dict);
  3932. Low-level function that performs Huffman encoding on a sequence of symbols using a provided dictionary, returned by C<huffman_from_freq()>.
  3933. It takes two parameters: C<\@symbols>, representing the sequence of symbols to be encoded, and C<$dict>, representing the Huffman dictionary mapping symbols to their corresponding Huffman codes.
  3934. The function returns a concatenated string of 1s and 0s, representing the Huffman-encoded sequence of symbols.
  3935. =head2 huffman_decode
  3936. my $symbols = huffman_decode($bitstring, $rev_dict);
  3937. Low-level function that decodes a Huffman-encoded binary string into a sequence of symbols using a provided reverse dictionary.
  3938. It takes two parameters: C<$bitstring>, representing the Huffman-encoded string of 1s and 0s, as returned by C<huffman_encode()>, and C<$rev_dict>, representing the reverse dictionary mapping Huffman codes to their corresponding symbols.
  3939. The function returns the decoded sequence of symbols as an array-ref.
  3940. =head2 lz77_encode / lz77_encode_symbolic
  3941. my ($literals, $distances, $lengths, $matches) = lz77_encode($string);
  3942. my ($literals, $distances, $lengths, $matches) = lz77_encode(\@symbols);
  3943. Low-level function that combines LZSS with ideas from the LZ4 method.
  3944. The function returns four values:
  3945. $literals # array-ref of uncompressed symbols
  3946. $distances # array-ref of back-reference distances
  3947. $lengths # array-ref of literal lengths
  3948. $matches # array-ref of match lengths
  3949. The output can be decoded with C<lz77_decode()> and C<lz77_decode_symbolic()>, respectively.
  3950. =head2 lz77_decode / lz77_decode_symbolic
  3951. my $string = lz77_decode(\@literals, \@distances, \@lengths, \@matches);
  3952. my $symbols = lz77_decode_symbolic(\@literals, \@distances, \@lengths, \@matches);
  3953. Low-level function that performs decoding using the provided literals, distances, lengths and matches, returned by LZ77 encoding.
  3954. Inverse of C<lz77_encode()> and C<lz77_encode_symbolic()>, respectively.
  3955. =head2 lzss_encode / lzss_encode_fast / lzss_encode_symbolic / lzss_encode_fast_symbolic
  3956. # Standard version
  3957. my ($literals, $distances, $lengths) = lzss_encode($data, %params);
  3958. my ($literals, $distances, $lengths) = lzss_encode(\@symbols, %params);
  3959. # Faster version
  3960. my ($literals, $distances, $lengths) = lzss_encode_fast($data, %params);
  3961. my ($literals, $distances, $lengths) = lzss_encode_fast(\@symbols, %params);
  3962. Low-level function that applies the LZSS (Lempel-Ziv-Storer-Szymanski) algorithm on the provided data.
  3963. The accepted C<%params> are:
  3964. min_len => $LZ_MIN_LEN,
  3965. max_len => $LZ_MAX_LEN,
  3966. max_dist => $LZ_MAX_DIST,
  3967. max_chain_len => $LZ_MAX_CHAIN_LEN,
  3968. The function returns three values:
  3969. $literals # array-ref of uncompressed symbols
  3970. $distances # array-ref of back-reference distances
  3971. $lengths # array-ref of match lengths
  3972. The output can be decoded with C<lzss_decode()> and C<lzss_decode_symbolic()>, respectively.
  3973. =head2 lzss_decode / lzss_decode_symbolic
  3974. my $string = lzss_decode(\@literals, \@distances, \@lengths);
  3975. my $symbols = lzss_decode_symbolic(\@literals, \@distances, \@lengths);
  3976. Low-level function that decodes the LZSS encoding, using the provided literals, distances, and lengths of matched sub-strings.
  3977. Inverse of C<lzss_encode()> and C<lzss_encode_fast()>.
  3978. =head2 deflate_encode
  3979. # Returns a binary string
  3980. my $string = deflate_encode(\@literals, \@distances, \@lengths);
  3981. my $string = deflate_encode(\@literals, \@distances, \@lengths, \&create_ac_entry);
  3982. Low-level function that encodes the results returned by C<lzss_encode()> and C<lzss_encode_fast()>, using a DEFLATE-like approach, combined with Huffman coding.
  3983. =head2 deflate_decode
  3984. # Huffman decoding
  3985. my ($literals, $distances, $lengths) = deflate_decode($fh);
  3986. my ($literals, $distances, $lengths) = deflate_decode($string);
  3987. # Arithmetic decoding
  3988. my ($literals, $distances, $lengths) = deflate_decode($fh, \&decode_ac_entry);
  3989. my ($literals, $distances, $lengths) = deflate_decode($string, \&decode_ac_entry);
  3990. Inverse of C<deflate_encode()>.
  3991. =head2 make_deflate_tables
  3992. my ($DISTANCE_SYMBOLS, $LENGTH_SYMBOLS, $LENGTH_INDICES) = make_deflate_tables($max_dist, $max_len);
  3993. Low-level function that returns a list of tables used in encoding the relative back-reference distances and lengths returned by C<lzss_encode()> and C<lzss_encode_fast()>.
  3994. When no arguments are provided:
  3995. $max_dist = $Compression::Util::LZ_MAX_DIST
  3996. $max_len = $Compression::Util::LZ_MAX_LEN
  3997. There is no need to call this function explicitly. Use C<deflate_encode()> instead!
  3998. =head2 find_deflate_index
  3999. my $index = find_deflate_index($value, $DISTANCE_SYMBOLS);
  4000. Low-level function that returns the index inside the DEFLATE tables for a given value.
  4001. =head1 EXPORT
  4002. Each function can be exported individually, as:
  4003. use Compression::Util qw(bwt_compress);
  4004. By specifying the B<:all> keyword, will export all the exportable functions:
  4005. use Compression::Util qw(:all);
  4006. Nothing is exported by default.
  4007. =head1 EXAMPLES
  4008. The functions can be combined in various ways, easily creating novel compression methods, as illustrated in the following examples.
  4009. =head2 Combining LZSS + MRL compression:
  4010. my $enc = lzss_compress($str, \&mrl_compress_symbolic);
  4011. my $dec = lzss_decompress($enc, \&mrl_decompress_symbolic);
  4012. =head2 Combining LZ77 + OBH encoding:
  4013. my $enc = lz77_compress($str, \&obh_encode);
  4014. my $dec = lz77_decompress($enc, \&obh_decode);
  4015. =head2 Combining LZSS + symbolic BWT compression:
  4016. my $enc = lzss_compress($str, \&bwt_compress_symbolic);
  4017. my $dec = lzss_decompress($enc, \&bwt_decompress_symbolic);
  4018. =head2 Combining BWT + symbolic LZSS:
  4019. my $enc = bwt_compress($str, \&lzss_compress_symbolic);
  4020. my $dec = bwt_decompress($enc, \&lzss_decompress_symbolic);
  4021. =head2 Combining LZW + Fibonacci encoding:
  4022. my $enc = lzw_compress($str, \&fibonacci_encode);
  4023. my $dec = lzw_decompress($enc, \&fibonacci_decode);
  4024. =head2 Combining BWT + symbolic LZ77 + symbolic MRL:
  4025. my $enc = bwt_compress($str, sub ($s) { lz77_compress_symbolic($s, \&mrl_compress_symbolic) });
  4026. my $dec = bwt_decompress($enc, sub ($s) { lz77_decompress_symbolic($s, \&mrl_decompress_symbolic) });
  4027. =head2 Combining LZ77 + BWT compression + Fibonacci encoding + Huffman coding + OBH encoding + MRL compression:
  4028. # Compression
  4029. my $enc = do {
  4030. my ($literals, $distances, $lengths, $matches) = lz77_encode($str);
  4031. bwt_compress(symbols2string($literals))
  4032. . fibonacci_encode($lengths)
  4033. . create_huffman_entry($matches)
  4034. . obh_encode($distances, \&mrl_compress_symbolic);
  4035. };
  4036. # Decompression
  4037. my $dec = do {
  4038. open my $fh, '<:raw', \$enc;
  4039. my $literals = string2symbols(bwt_decompress($fh));
  4040. my $lengths = fibonacci_decode($fh);
  4041. my $matches = decode_huffman_entry($fh);
  4042. my $distances = obh_decode($fh, \&mrl_decompress_symbolic);
  4043. lz77_decode($literals, $distances, $lengths, $matches);
  4044. };
  4045. =head1 REFERENCES
  4046. =over 4
  4047. =item * DEFLATE Compressed Data Format Specification
  4048. L<https://datatracker.ietf.org/doc/html/rfc1951>
  4049. =item * GZIP file format specification
  4050. L<https://datatracker.ietf.org/doc/html/rfc1952>
  4051. =item * BZIP2 Format Specification, by Joe Tsai:
  4052. L<https://github.com/dsnet/compress/blob/master/doc/bzip2-format.pdf>
  4053. =item * LZ4 Frame format
  4054. L<https://github.com/lz4/lz4/blob/dev/doc/lz4_Frame_format.md>
  4055. =item * LZ4 Block format
  4056. L<https://github.com/lz4/lz4/blob/dev/doc/lz4_Block_format.md>
  4057. =item * Data Compression (Summer 2023) - Lecture 4 - The Unix 'compress' Program:
  4058. L<https://youtube.com/watch?v=1cJL9Va80Pk>
  4059. =item * Data Compression (Summer 2023) - Lecture 5 - Basic Techniques:
  4060. L<https://youtube.com/watch?v=TdFWb8mL5Gk>
  4061. =item * Data Compression (Summer 2023) - Lecture 11 - DEFLATE (gzip):
  4062. L<https://youtube.com/watch?v=SJPvNi4HrWQ>
  4063. =item * Data Compression (Summer 2023) - Lecture 12 - The Burrows-Wheeler Transform (BWT):
  4064. L<https://youtube.com/watch?v=rQ7wwh4HRZM>
  4065. =item * Data Compression (Summer 2023) - Lecture 13 - BZip2:
  4066. L<https://youtube.com/watch?v=cvoZbBZ3M2A>
  4067. =item * Data Compression (Summer 2023) - Lecture 15 - Infinite Precision in Finite Bits:
  4068. L<https://youtube.com/watch?v=EqKbT3QdtOI>
  4069. =item * Information Retrieval WS 17/18, Lecture 4: Compression, Codes, Entropy:
  4070. L<https://youtube.com/watch?v=A_F94FV21Ek>
  4071. =item * COMP526 7-5 SS7.4 Run length encoding:
  4072. L<https://youtube.com/watch?v=3jKLjmV1bL8>
  4073. =item * COMP526 Unit 7-6 2020-03-24 Compression - Move-to-front transform:
  4074. L<https://youtube.com/watch?v=Q2pinaj3i9Y>
  4075. =item * Basic arithmetic coder in C++:
  4076. L<https://github.com/billbird/arith32>
  4077. =back
  4078. =head1 REPOSITORY
  4079. =over 4
  4080. =item * GitHub: L<https://github.com/trizen/Compression-Util>
  4081. =back
  4082. =head1 BUGS AND LIMITATIONS
  4083. Please report any bugs or feature requests to: L<https://github.com/trizen/Compression-Util>.
  4084. =head1 AUTHOR
  4085. Daniel "Trizen" Șuteu C<< <trizen@cpan.org> >>
  4086. =head1 ACKNOWLEDGEMENTS
  4087. Special thanks to professor Bill Bird for the awesome YouTube lectures on data compression.
  4088. =head1 LICENSE
  4089. This library is free software; you can redistribute it and/or modify
  4090. it under the same terms as Perl itself, either Perl version 5.38.2 or,
  4091. at your option, any later version of Perl 5 you may have available.
  4092. =cut