123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977197819791980198119821983198419851986198719881989199019911992199319941995199619971998199920002001200220032004200520062007200820092010201120122013201420152016201720182019202020212022202320242025202620272028202920302031203220332034203520362037203820392040204120422043204420452046204720482049205020512052205320542055205620572058205920602061206220632064206520662067206820692070207120722073207420752076207720782079208020812082208320842085208620872088208920902091209220932094209520962097209820992100210121022103210421052106210721082109211021112112211321142115211621172118211921202121212221232124212521262127212821292130213121322133213421352136213721382139214021412142214321442145214621472148214921502151215221532154215521562157215821592160216121622163216421652166216721682169217021712172217321742175217621772178217921802181218221832184218521862187218821892190219121922193219421952196219721982199220022012202220322042205220622072208220922102211221222132214221522162217221822192220222122222223222422252226222722282229223022312232223322342235223622372238223922402241224222432244224522462247224822492250225122522253225422552256225722582259226022612262226322642265226622672268226922702271227222732274227522762277227822792280228122822283228422852286228722882289229022912292229322942295229622972298229923002301230223032304230523062307230823092310231123122313231423152316231723182319232023212322232323242325232623272328232923302331233223332334233523362337233823392340234123422343234423452346234723482349235023512352235323542355235623572358235923602361236223632364236523662367236823692370237123722373237423752376237723782379238023812382238323842385238623872388238923902391239223932394239523962397239823992400240124022403240424052406240724082409241024112412241324142415241624172418241924202421242224232424242524262427242824292430243124322433243424352436243724382439244024412442244324442445244624472448244924502451245224532454245524562457245824592460246124622463246424652466246724682469247024712472247324742475247624772478247924802481248224832484248524862487248824892490249124922493249424952496249724982499250025012502250325042505250625072508250925102511251225132514251525162517251825192520252125222523252425252526252725282529253025312532253325342535253625372538253925402541254225432544254525462547254825492550255125522553255425552556255725582559256025612562256325642565256625672568256925702571257225732574257525762577257825792580258125822583258425852586258725882589259025912592259325942595259625972598259926002601260226032604260526062607260826092610261126122613261426152616261726182619262026212622262326242625262626272628262926302631263226332634263526362637263826392640264126422643264426452646264726482649265026512652265326542655265626572658265926602661266226632664266526662667266826692670267126722673267426752676267726782679268026812682268326842685268626872688268926902691269226932694269526962697269826992700270127022703270427052706270727082709271027112712271327142715271627172718271927202721272227232724272527262727272827292730273127322733273427352736273727382739274027412742274327442745274627472748274927502751275227532754275527562757275827592760276127622763276427652766276727682769277027712772277327742775277627772778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327732783279328032813282328332843285328632873288328932903291329232933294329532963297329832993300330133023303330433053306330733083309331033113312331333143315331633173318331933203321332233233324332533263327332833293330333133323333333433353336333733383339334033413342334333443345334633473348334933503351335233533354335533563357335833593360336133623363336433653366336733683369337033713372337333743375337633773378337933803381338233833384338533863387338833893390339133923393339433953396339733983399340034013402340334043405340634073408340934103411341234133414341534163417341834193420342134223423342434253426342734283429343034313432343334343435343634373438343934403441344234433444344534463447344834493450345134523453345434553456345734583459346034613462346334643465346634673468346934703471347234733474347534763477347834793480348134823483348434853486348734883489349034913492349334943495349634973498349935003501350235033504350535063507350835093510351135123513351435153516351735183519352035213522352335243525352635273528352935303531353235333534353535363537353835393540354135423543354435453546354735483549355035513552355335543555355635573558355935603561356235633564356535663567356835693570357135723573357435753576357735783579358035813582358335843585358635873588358935903591359235933594359535963597359835993600360136023603360436053606360736083609361036113612361336143615361636173618361936203621362236233624362536263627362836293630363136323633363436353636363736383639364036413642364336443645364636473648364936503651365236533654365536563657365836593660366136623663366436653666366736683669367036713672367336743675367636773678367936803681368236833684368536863687368836893690369136923693369436953696369736983699370037013702370337043705370637073708370937103711371237133714371537163717371837193720372137223723372437253726372737283729373037313732373337343735373637373738373937403741374237433744374537463747374837493750375137523753375437553756375737583759376037613762376337643765376637673768376937703771377237733774377537763777377837793780378137823783378437853786378737883789379037913792379337943795379637973798379938003801380238033804380538063807380838093810381138123813381438153816381738183819382038213822382338243825382638273828382938303831383238333834383538363837383838393840384138423843384438453846384738483849385038513852385338543855385638573858385938603861386238633864386538663867386838693870387138723873387438753876387738783879388038813882388338843885388638873888388938903891389238933894389538963897389838993900390139023903390439053906390739083909391039113912391339143915391639173918391939203921392239233924392539263927392839293930393139323933393439353936393739383939394039413942394339443945394639473948394939503951395239533954395539563957395839593960396139623963396439653966396739683969397039713972397339743975397639773978397939803981398239833984398539863987398839893990399139923993399439953996399739983999400040014002400340044005400640074008400940104011401240134014401540164017401840194020402140224023402440254026402740284029403040314032403340344035403640374038403940404041404240434044404540464047404840494050405140524053405440554056405740584059406040614062406340644065406640674068406940704071407240734074407540764077407840794080408140824083408440854086408740884089409040914092409340944095409640974098409941004101410241034104410541064107410841094110411141124113411441154116411741184119412041214122412341244125412641274128412941304131413241334134413541364137413841394140414141424143414441454146414741484149415041514152415341544155415641574158415941604161416241634164416541664167416841694170417141724173417441754176417741784179418041814182418341844185418641874188418941904191419241934194419541964197419841994200420142024203420442054206420742084209421042114212421342144215421642174218421942204221422242234224422542264227422842294230423142324233423442354236423742384239424042414242424342444245424642474248424942504251425242534254425542564257425842594260426142624263426442654266426742684269427042714272427342744275427642774278427942804281428242834284428542864287428842894290429142924293429442954296429742984299430043014302430343044305430643074308430943104311431243134314431543164317431843194320432143224323432443254326432743284329433043314332433343344335433643374338433943404341434243434344434543464347434843494350435143524353435443554356435743584359436043614362436343644365436643674368436943704371437243734374437543764377437843794380438143824383438443854386438743884389439043914392439343944395439643974398439944004401440244034404440544064407440844094410441144124413441444154416441744184419442044214422442344244425442644274428442944304431443244334434443544364437443844394440444144424443444444454446444744484449445044514452445344544455445644574458445944604461446244634464446544664467446844694470447144724473447444754476447744784479448044814482448344844485448644874488448944904491449244934494449544964497449844994500450145024503450445054506450745084509451045114512451345144515451645174518451945204521452245234524452545264527452845294530453145324533453445354536453745384539454045414542454345444545454645474548454945504551455245534554455545564557455845594560456145624563456445654566456745684569457045714572457345744575457645774578457945804581458245834584458545864587458845894590459145924593459445954596459745984599460046014602460346044605460646074608460946104611461246134614461546164617461846194620462146224623462446254626462746284629463046314632463346344635463646374638463946404641464246434644464546464647464846494650465146524653465446554656465746584659466046614662466346644665466646674668466946704671467246734674467546764677467846794680468146824683468446854686468746884689469046914692469346944695469646974698469947004701470247034704470547064707470847094710471147124713471447154716471747184719472047214722472347244725472647274728472947304731473247334734473547364737473847394740474147424743474447454746474747484749475047514752475347544755475647574758475947604761476247634764476547664767476847694770477147724773477447754776477747784779478047814782478347844785478647874788478947904791479247934794479547964797479847994800480148024803480448054806480748084809481048114812481348144815481648174818481948204821482248234824482548264827482848294830483148324833483448354836483748384839484048414842484348444845484648474848484948504851485248534854485548564857485848594860486148624863486448654866486748684869487048714872487348744875487648774878487948804881488248834884488548864887488848894890489148924893489448954896489748984899490049014902490349044905490649074908490949104911491249134914491549164917491849194920492149224923492449254926492749284929493049314932493349344935493649374938493949404941494249434944494549464947494849494950495149524953495449554956495749584959496049614962496349644965496649674968496949704971497249734974497549764977497849794980498149824983498449854986498749884989499049914992499349944995499649974998499950005001500250035004500550065007500850095010501150125013501450155016501750185019502050215022502350245025502650275028502950305031503250335034503550365037503850395040504150425043504450455046504750485049505050515052505350545055505650575058505950605061506250635064506550665067506850695070507150725073507450755076507750785079508050815082508350845085508650875088508950905091509250935094509550965097509850995100510151025103510451055106510751085109511051115112511351145115511651175118511951205121512251235124512551265127512851295130513151325133513451355136513751385139514051415142514351445145514651475148514951505151515251535154515551565157515851595160516151625163516451655166516751685169517051715172517351745175517651775178517951805181518251835184518551865187518851895190519151925193519451955196519751985199520052015202520352045205520652075208520952105211521252135214521552165217521852195220522152225223522452255226522752285229523052315232523352345235523652375238523952405241524252435244524552465247524852495250525152525253525452555256525752585259526052615262526352645265526652675268526952705271527252735274527552765277527852795280528152825283528452855286528752885289529052915292529352945295529652975298529953005301530253035304530553065307530853095310531153125313531453155316531753185319532053215322532353245325532653275328532953305331533253335334533553365337533853395340534153425343534453455346534753485349535053515352535353545355535653575358535953605361536253635364536553665367536853695370537153725373537453755376537753785379538053815382538353845385538653875388538953905391539253935394539553965397539853995400540154025403540454055406540754085409541054115412541354145415541654175418541954205421542254235424542554265427542854295430543154325433543454355436543754385439544054415442544354445445544654475448544954505451545254535454545554565457545854595460546154625463546454655466546754685469547054715472547354745475547654775478547954805481548254835484548554865487548854895490549154925493549454955496549754985499550055015502550355045505550655075508550955105511551255135514551555165517551855195520552155225523552455255526552755285529 |
- From 01010419a6499768563e7b2f3fd56cf16edda75e Mon Sep 17 00:00:00 2001
- From: rpm-build <rpm-build>
- Date: Mon, 4 Oct 2021 08:54:37 +0200
- Subject: [PATCH] coreutils-i18n.patch
- ---
- bootstrap.conf | 1 +
- configure.ac | 2 +
- lib/linebuffer.h | 8 +
- lib/mbfile.c | 3 +
- lib/mbfile.h | 255 ++++++++++++
- m4/mbfile.m4 | 14 +
- src/cut.c | 508 +++++++++++++++++++++--
- src/expand-common.c | 114 ++++++
- src/expand-common.h | 12 +
- src/expand.c | 90 +++-
- src/fold.c | 312 ++++++++++++--
- src/join.c | 359 ++++++++++++++--
- src/local.mk | 4 +-
- src/pr.c | 443 ++++++++++++++++++--
- src/sort.c | 792 +++++++++++++++++++++++++++++++++---
- src/unexpand.c | 103 ++++-
- src/uniq.c | 119 +++++-
- tests/Coreutils.pm | 3 +
- tests/expand/mb.sh | 183 +++++++++
- tests/i18n/sort.sh | 29 ++
- tests/local.mk | 4 +
- tests/misc/expand.pl | 42 ++
- tests/misc/fold.pl | 50 ++-
- tests/misc/join.pl | 50 +++
- tests/misc/sort-mb-tests.sh | 45 ++
- tests/misc/sort-merge.pl | 42 ++
- tests/misc/sort.pl | 40 +-
- tests/misc/unexpand.pl | 39 ++
- tests/misc/uniq.pl | 55 +++
- tests/pr/pr-tests.pl | 49 +++
- tests/unexpand/mb.sh | 172 ++++++++
- 31 files changed, 3700 insertions(+), 242 deletions(-)
- create mode 100644 lib/mbfile.c
- create mode 100644 lib/mbfile.h
- create mode 100644 m4/mbfile.m4
- create mode 100755 tests/expand/mb.sh
- create mode 100755 tests/i18n/sort.sh
- create mode 100755 tests/misc/sort-mb-tests.sh
- create mode 100755 tests/unexpand/mb.sh
- diff --git a/bootstrap.conf b/bootstrap.conf
- index c1399e3..60b39cf 100644
- --- a/bootstrap.conf
- +++ b/bootstrap.conf
- @@ -162,6 +162,7 @@ gnulib_modules="
- maintainer-makefile
- malloc-gnu
- manywarnings
- + mbfile
- mbrlen
- mbrtowc
- mbsalign
- diff --git a/configure.ac b/configure.ac
- index 7e4afc9..4656a35 100644
- --- a/configure.ac
- +++ b/configure.ac
- @@ -476,6 +476,8 @@ fi
- # I'm leaving it here for now. This whole thing needs to be modernized...
- gl_WINSIZE_IN_PTEM
- +gl_MBFILE
- +
- gl_HEADER_TIOCGWINSZ_IN_TERMIOS_H
- if test $gl_cv_sys_tiocgwinsz_needs_termios_h = no && \
- diff --git a/lib/linebuffer.h b/lib/linebuffer.h
- index 07d45ca..af62e6c 100644
- --- a/lib/linebuffer.h
- +++ b/lib/linebuffer.h
- @@ -22,6 +22,11 @@
- # include "idx.h"
- # include <stdio.h>
- +/* Get mbstate_t. */
- +# if HAVE_WCHAR_H
- +# include <wchar.h>
- +# endif
- +
- /* A 'struct linebuffer' holds a line of text. */
- struct linebuffer
- @@ -29,6 +34,9 @@ struct linebuffer
- idx_t size; /* Allocated. */
- idx_t length; /* Used. */
- char *buffer;
- +# if HAVE_WCHAR_H
- + mbstate_t state;
- +# endif
- };
- /* Initialize linebuffer LINEBUFFER for use. */
- diff --git a/lib/mbfile.c b/lib/mbfile.c
- new file mode 100644
- index 0000000..b0a468e
- --- /dev/null
- +++ b/lib/mbfile.c
- @@ -0,0 +1,3 @@
- +#include <config.h>
- +#define MBFILE_INLINE _GL_EXTERN_INLINE
- +#include "mbfile.h"
- diff --git a/lib/mbfile.h b/lib/mbfile.h
- new file mode 100644
- index 0000000..11f1b12
- --- /dev/null
- +++ b/lib/mbfile.h
- @@ -0,0 +1,255 @@
- +/* Multibyte character I/O: macros for multi-byte encodings.
- + Copyright (C) 2001, 2005, 2009-2015 Free Software Foundation, Inc.
- +
- + This program is free software: you can redistribute it and/or modify
- + it under the terms of the GNU General Public License as published by
- + the Free Software Foundation; either version 3 of the License, or
- + (at your option) any later version.
- +
- + This program is distributed in the hope that it will be useful,
- + but WITHOUT ANY WARRANTY; without even the implied warranty of
- + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- + GNU General Public License for more details.
- +
- + You should have received a copy of the GNU General Public License
- + along with this program. If not, see <http://www.gnu.org/licenses/>. */
- +
- +/* Written by Mitsuru Chinen <mchinen@yamato.ibm.com>
- + and Bruno Haible <bruno@clisp.org>. */
- +
- +/* The macros in this file implement multi-byte character input from a
- + stream.
- +
- + mb_file_t
- + is the type for multibyte character input stream, usable for variable
- + declarations.
- +
- + mbf_char_t
- + is the type for multibyte character or EOF, usable for variable
- + declarations.
- +
- + mbf_init (mbf, stream)
- + initializes the MB_FILE for reading from stream.
- +
- + mbf_getc (mbc, mbf)
- + reads the next multibyte character from mbf and stores it in mbc.
- +
- + mb_iseof (mbc)
- + returns true if mbc represents the EOF value.
- +
- + Here are the function prototypes of the macros.
- +
- + extern void mbf_init (mb_file_t mbf, FILE *stream);
- + extern void mbf_getc (mbf_char_t mbc, mb_file_t mbf);
- + extern bool mb_iseof (const mbf_char_t mbc);
- + */
- +
- +#ifndef _MBFILE_H
- +#define _MBFILE_H 1
- +
- +#include <assert.h>
- +#include <stdbool.h>
- +#include <stdio.h>
- +#include <string.h>
- +
- +/* Tru64 with Desktop Toolkit C has a bug: <stdio.h> must be included before
- + <wchar.h>.
- + BSD/OS 4.1 has a bug: <stdio.h> and <time.h> must be included before
- + <wchar.h>. */
- +#include <stdio.h>
- +#include <time.h>
- +#include <wchar.h>
- +
- +#include "mbchar.h"
- +
- +#ifndef _GL_INLINE_HEADER_BEGIN
- + #error "Please include config.h first."
- +#endif
- +_GL_INLINE_HEADER_BEGIN
- +#ifndef MBFILE_INLINE
- +# define MBFILE_INLINE _GL_INLINE
- +#endif
- +
- +struct mbfile_multi {
- + FILE *fp;
- + bool eof_seen;
- + bool have_pushback;
- + mbstate_t state;
- + unsigned int bufcount;
- + char buf[MBCHAR_BUF_SIZE];
- + struct mbchar pushback;
- +};
- +
- +MBFILE_INLINE void
- +mbfile_multi_getc (struct mbchar *mbc, struct mbfile_multi *mbf)
- +{
- + size_t bytes;
- +
- + /* If EOF has already been seen, don't use getc. This matters if
- + mbf->fp is connected to an interactive tty. */
- + if (mbf->eof_seen)
- + goto eof;
- +
- + /* Return character pushed back, if there is one. */
- + if (mbf->have_pushback)
- + {
- + mb_copy (mbc, &mbf->pushback);
- + mbf->have_pushback = false;
- + return;
- + }
- +
- + /* Before using mbrtowc, we need at least one byte. */
- + if (mbf->bufcount == 0)
- + {
- + int c = getc (mbf->fp);
- + if (c == EOF)
- + {
- + mbf->eof_seen = true;
- + goto eof;
- + }
- + mbf->buf[0] = (unsigned char) c;
- + mbf->bufcount++;
- + }
- +
- + /* Handle most ASCII characters quickly, without calling mbrtowc(). */
- + if (mbf->bufcount == 1 && mbsinit (&mbf->state) && is_basic (mbf->buf[0]))
- + {
- + /* These characters are part of the basic character set. ISO C 99
- + guarantees that their wide character code is identical to their
- + char code. */
- + mbc->wc = mbc->buf[0] = mbf->buf[0];
- + mbc->wc_valid = true;
- + mbc->ptr = &mbc->buf[0];
- + mbc->bytes = 1;
- + mbf->bufcount = 0;
- + return;
- + }
- +
- + /* Use mbrtowc on an increasing number of bytes. Read only as many bytes
- + from mbf->fp as needed. This is needed to give reasonable interactive
- + behaviour when mbf->fp is connected to an interactive tty. */
- + for (;;)
- + {
- + /* We don't know whether the 'mbrtowc' function updates the state when
- + it returns -2, - this is the ISO C 99 and glibc-2.2 behaviour - or
- + not - amended ANSI C, glibc-2.1 and Solaris 2.7 behaviour. We
- + don't have an autoconf test for this, yet.
- + The new behaviour would allow us to feed the bytes one by one into
- + mbrtowc. But the old behaviour forces us to feed all bytes since
- + the end of the last character into mbrtowc. Since we want to retry
- + with more bytes when mbrtowc returns -2, we must backup the state
- + before calling mbrtowc, because implementations with the new
- + behaviour will clobber it. */
- + mbstate_t backup_state = mbf->state;
- +
- + bytes = mbrtowc (&mbc->wc, &mbf->buf[0], mbf->bufcount, &mbf->state);
- +
- + if (bytes == (size_t) -1)
- + {
- + /* An invalid multibyte sequence was encountered. */
- + /* Return a single byte. */
- + bytes = 1;
- + mbc->wc_valid = false;
- + break;
- + }
- + else if (bytes == (size_t) -2)
- + {
- + /* An incomplete multibyte character. */
- + mbf->state = backup_state;
- + if (mbf->bufcount == MBCHAR_BUF_SIZE)
- + {
- + /* An overlong incomplete multibyte sequence was encountered. */
- + /* Return a single byte. */
- + bytes = 1;
- + mbc->wc_valid = false;
- + break;
- + }
- + else
- + {
- + /* Read one more byte and retry mbrtowc. */
- + int c = getc (mbf->fp);
- + if (c == EOF)
- + {
- + /* An incomplete multibyte character at the end. */
- + mbf->eof_seen = true;
- + bytes = mbf->bufcount;
- + mbc->wc_valid = false;
- + break;
- + }
- + mbf->buf[mbf->bufcount] = (unsigned char) c;
- + mbf->bufcount++;
- + }
- + }
- + else
- + {
- + if (bytes == 0)
- + {
- + /* A null wide character was encountered. */
- + bytes = 1;
- + assert (mbf->buf[0] == '\0');
- + assert (mbc->wc == 0);
- + }
- + mbc->wc_valid = true;
- + break;
- + }
- + }
- +
- + /* Return the multibyte sequence mbf->buf[0..bytes-1]. */
- + mbc->ptr = &mbc->buf[0];
- + memcpy (&mbc->buf[0], &mbf->buf[0], bytes);
- + mbc->bytes = bytes;
- +
- + mbf->bufcount -= bytes;
- + if (mbf->bufcount > 0)
- + {
- + /* It's not worth calling memmove() for so few bytes. */
- + unsigned int count = mbf->bufcount;
- + char *p = &mbf->buf[0];
- +
- + do
- + {
- + *p = *(p + bytes);
- + p++;
- + }
- + while (--count > 0);
- + }
- + return;
- +
- +eof:
- + /* An mbchar_t with bytes == 0 is used to indicate EOF. */
- + mbc->ptr = NULL;
- + mbc->bytes = 0;
- + mbc->wc_valid = false;
- + return;
- +}
- +
- +MBFILE_INLINE void
- +mbfile_multi_ungetc (const struct mbchar *mbc, struct mbfile_multi *mbf)
- +{
- + mb_copy (&mbf->pushback, mbc);
- + mbf->have_pushback = true;
- +}
- +
- +typedef struct mbfile_multi mb_file_t;
- +
- +typedef mbchar_t mbf_char_t;
- +
- +#define mbf_init(mbf, stream) \
- + ((mbf).fp = (stream), \
- + (mbf).eof_seen = false, \
- + (mbf).have_pushback = false, \
- + memset (&(mbf).state, '\0', sizeof (mbstate_t)), \
- + (mbf).bufcount = 0)
- +
- +#define mbf_getc(mbc, mbf) mbfile_multi_getc (&(mbc), &(mbf))
- +
- +#define mbf_ungetc(mbc, mbf) mbfile_multi_ungetc (&(mbc), &(mbf))
- +
- +#define mb_iseof(mbc) ((mbc).bytes == 0)
- +
- +#ifndef _GL_INLINE_HEADER_BEGIN
- + #error "Please include config.h first."
- +#endif
- +_GL_INLINE_HEADER_BEGIN
- +
- +#endif /* _MBFILE_H */
- diff --git a/m4/mbfile.m4 b/m4/mbfile.m4
- new file mode 100644
- index 0000000..8589902
- --- /dev/null
- +++ b/m4/mbfile.m4
- @@ -0,0 +1,14 @@
- +# mbfile.m4 serial 7
- +dnl Copyright (C) 2005, 2008-2015 Free Software Foundation, Inc.
- +dnl This file is free software; the Free Software Foundation
- +dnl gives unlimited permission to copy and/or distribute it,
- +dnl with or without modifications, as long as this notice is preserved.
- +
- +dnl autoconf tests required for use of mbfile.h
- +dnl From Bruno Haible.
- +
- +AC_DEFUN([gl_MBFILE],
- +[
- + AC_REQUIRE([AC_TYPE_MBSTATE_T])
- + :
- +])
- diff --git a/src/cut.c b/src/cut.c
- index 6fd8978..faef877 100644
- --- a/src/cut.c
- +++ b/src/cut.c
- @@ -28,6 +28,11 @@
- #include <assert.h>
- #include <getopt.h>
- #include <sys/types.h>
- +
- +/* Get mbstate_t, mbrtowc(). */
- +#if HAVE_WCHAR_H
- +# include <wchar.h>
- +#endif
- #include "system.h"
- #include "error.h"
- @@ -37,6 +42,18 @@
- #include "set-fields.h"
- +/* MB_LEN_MAX is incorrectly defined to be 1 in at least one GCC
- + installation; work around this configuration error. */
- +#if !defined MB_LEN_MAX || MB_LEN_MAX < 2
- +# undef MB_LEN_MAX
- +# define MB_LEN_MAX 16
- +#endif
- +
- +/* Some systems, like BeOS, have multibyte encodings but lack mbstate_t. */
- +#if HAVE_MBRTOWC && defined mbstate_t
- +# define mbrtowc(pwc, s, n, ps) (mbrtowc) (pwc, s, n, 0)
- +#endif
- +
- /* The official name of this program (e.g., no 'g' prefix). */
- #define PROGRAM_NAME "cut"
- @@ -53,6 +70,52 @@
- } \
- while (0)
- +/* Refill the buffer BUF to get a multibyte character. */
- +#define REFILL_BUFFER(BUF, BUFPOS, BUFLEN, STREAM) \
- + do \
- + { \
- + if (BUFLEN < MB_LEN_MAX && !feof (STREAM) && !ferror (STREAM)) \
- + { \
- + memmove (BUF, BUFPOS, BUFLEN); \
- + BUFLEN += fread (BUF + BUFLEN, sizeof(char), BUFSIZ, STREAM); \
- + BUFPOS = BUF; \
- + } \
- + } \
- + while (0)
- +
- +/* Get wide character on BUFPOS. BUFPOS is not included after that.
- + If byte sequence is not valid as a character, CONVFAIL is true. Otherwise false. */
- +#define GET_NEXT_WC_FROM_BUFFER(WC, BUFPOS, BUFLEN, MBLENGTH, STATE, CONVFAIL) \
- + do \
- + { \
- + mbstate_t state_bak; \
- + \
- + if (BUFLEN < 1) \
- + { \
- + WC = WEOF; \
- + break; \
- + } \
- + \
- + /* Get a wide character. */ \
- + CONVFAIL = false; \
- + state_bak = STATE; \
- + MBLENGTH = mbrtowc ((wchar_t *)&WC, BUFPOS, BUFLEN, &STATE); \
- + \
- + switch (MBLENGTH) \
- + { \
- + case (size_t)-1: \
- + case (size_t)-2: \
- + CONVFAIL = true; \
- + STATE = state_bak; \
- + /* Fall througn. */ \
- + \
- + case 0: \
- + MBLENGTH = 1; \
- + break; \
- + } \
- + } \
- + while (0)
- +
- /* Pointer inside RP. When checking if a byte or field is selected
- by a finite range, we check if it is between CURRENT_RP.LO
- @@ -60,6 +123,9 @@
- CURRENT_RP.HI then we make CURRENT_RP to point to the next range pair. */
- static struct field_range_pair *current_rp;
- +/* Length of the delimiter given as argument to -d. */
- +size_t delimlen;
- +
- /* This buffer is used to support the semantics of the -s option
- (or lack of same) when the specified field list includes (does
- not include) the first field. In both of those cases, the entire
- @@ -72,6 +138,29 @@ static char *field_1_buffer;
- /* The number of bytes allocated for FIELD_1_BUFFER. */
- static size_t field_1_bufsize;
- +enum operating_mode
- + {
- + undefined_mode,
- +
- + /* Output bytes that are at the given positions. */
- + byte_mode,
- +
- + /* Output characters that are at the given positions. */
- + character_mode,
- +
- + /* Output the given delimiter-separated fields. */
- + field_mode
- + };
- +
- +static enum operating_mode operating_mode;
- +
- +/* If nonzero, when in byte mode, don't split multibyte characters. */
- +static int byte_mode_character_aware;
- +
- +/* If nonzero, the function for single byte locale is work
- + if this program runs on multibyte locale. */
- +static int force_singlebyte_mode;
- +
- /* If true do not output lines containing no delimiter characters.
- Otherwise, all such lines are printed. This option is valid only
- with field mode. */
- @@ -83,10 +172,16 @@ static bool complement;
- /* The delimiter character for field mode. */
- static unsigned char delim;
- +#if HAVE_WCHAR_H
- +static wchar_t wcdelim;
- +#endif
- /* The delimiter for each line/record. */
- static unsigned char line_delim = '\n';
- +/* True if the --output-delimiter=STRING option was specified. */
- +static bool output_delimiter_specified;
- +
- /* The length of output_delimiter_string. */
- static size_t output_delimiter_length;
- @@ -94,9 +189,6 @@ static size_t output_delimiter_length;
- string consisting of the input delimiter. */
- static char *output_delimiter_string;
- -/* The output delimiter string contents, if the default. */
- -static char output_delimiter_default[1];
- -
- /* True if we have ever read standard input. */
- static bool have_read_stdin;
- @@ -150,7 +242,7 @@ Print selected parts of lines from each FILE to standard output.\n\
- -f, --fields=LIST select only these fields; also print any line\n\
- that contains no delimiter character, unless\n\
- the -s option is specified\n\
- - -n (ignored)\n\
- + -n with -b: don't split multibyte characters\n\
- "), stdout);
- fputs (_("\
- --complement complement the set of selected bytes, characters\n\
- @@ -250,7 +342,7 @@ cut_bytes (FILE *stream)
- next_item (&byte_idx);
- if (print_kth (byte_idx))
- {
- - if (output_delimiter_string != output_delimiter_default)
- + if (output_delimiter_specified)
- {
- if (print_delimiter && is_range_start_index (byte_idx))
- {
- @@ -266,6 +358,82 @@ cut_bytes (FILE *stream)
- }
- }
- +#if HAVE_MBRTOWC
- +/* This function is in use for the following case.
- +
- + 1. Read from the stream STREAM, printing to standard output any selected
- + characters.
- +
- + 2. Read from stream STREAM, printing to standard output any selected bytes,
- + without splitting multibyte characters. */
- +
- +static void
- +cut_characters_or_cut_bytes_no_split (FILE *stream)
- +{
- + uintmax_t idx; /* number of bytes or characters in the line so far. */
- + char buf[MB_LEN_MAX + BUFSIZ]; /* For spooling a read byte sequence. */
- + char *bufpos; /* Next read position of BUF. */
- + size_t buflen; /* The length of the byte sequence in buf. */
- + wint_t wc; /* A gotten wide character. */
- + size_t mblength; /* The byte size of a multibyte character which shows
- + as same character as WC. */
- + mbstate_t state; /* State of the stream. */
- + bool convfail = false; /* true, when conversion failed. Otherwise false. */
- + /* Whether to begin printing delimiters between ranges for the current line.
- + Set after we've begun printing data corresponding to the first range. */
- + bool print_delimiter = false;
- +
- + idx = 0;
- + buflen = 0;
- + bufpos = buf;
- + memset (&state, '\0', sizeof(mbstate_t));
- +
- + current_rp = frp;
- +
- + while (1)
- + {
- + REFILL_BUFFER (buf, bufpos, buflen, stream);
- +
- + GET_NEXT_WC_FROM_BUFFER (wc, bufpos, buflen, mblength, state, convfail);
- + (void) convfail; /* ignore unused */
- +
- + if (wc == WEOF)
- + {
- + if (idx > 0)
- + putchar (line_delim);
- + break;
- + }
- + else if (wc == line_delim)
- + {
- + putchar (line_delim);
- + idx = 0;
- + print_delimiter = false;
- + current_rp = frp;
- + }
- + else
- + {
- + next_item (&idx);
- + if (print_kth (idx))
- + {
- + if (output_delimiter_specified)
- + {
- + if (print_delimiter && is_range_start_index (idx))
- + {
- + fwrite (output_delimiter_string, sizeof (char),
- + output_delimiter_length, stdout);
- + }
- + print_delimiter = true;
- + }
- + fwrite (bufpos, mblength, sizeof(char), stdout);
- + }
- + }
- +
- + buflen -= mblength;
- + bufpos += mblength;
- + }
- +}
- +#endif
- +
- /* Read from stream STREAM, printing to standard output any selected fields. */
- static void
- @@ -411,11 +579,218 @@ cut_fields (FILE *stream)
- }
- }
- -/* Process file FILE to standard output, using CUT_STREAM.
- +#if HAVE_MBRTOWC
- +static void
- +cut_fields_mb (FILE *stream)
- +{
- + int c;
- + uintmax_t field_idx;
- + int found_any_selected_field;
- + int buffer_first_field;
- + int empty_input;
- + char buf[MB_LEN_MAX + BUFSIZ]; /* For spooling a read byte sequence. */
- + char *bufpos; /* Next read position of BUF. */
- + size_t buflen; /* The length of the byte sequence in buf. */
- + wint_t wc = 0; /* A gotten wide character. */
- + size_t mblength; /* The byte size of a multibyte character which shows
- + as same character as WC. */
- + mbstate_t state; /* State of the stream. */
- + bool convfail = false; /* true, when conversion failed. Otherwise false. */
- +
- + current_rp = frp;
- +
- + found_any_selected_field = 0;
- + field_idx = 1;
- + bufpos = buf;
- + buflen = 0;
- + memset (&state, '\0', sizeof(mbstate_t));
- +
- + c = getc (stream);
- + empty_input = (c == EOF);
- + if (c != EOF)
- + {
- + ungetc (c, stream);
- + wc = 0;
- + }
- + else
- + wc = WEOF;
- +
- + /* To support the semantics of the -s flag, we may have to buffer
- + all of the first field to determine whether it is `delimited.'
- + But that is unnecessary if all non-delimited lines must be printed
- + and the first field has been selected, or if non-delimited lines
- + must be suppressed and the first field has *not* been selected.
- + That is because a non-delimited line has exactly one field. */
- + buffer_first_field = (suppress_non_delimited ^ !print_kth (1));
- +
- + while (1)
- + {
- + if (field_idx == 1 && buffer_first_field)
- + {
- + int len = 0;
- +
- + while (1)
- + {
- + REFILL_BUFFER (buf, bufpos, buflen, stream);
- +
- + GET_NEXT_WC_FROM_BUFFER
- + (wc, bufpos, buflen, mblength, state, convfail);
- +
- + if (wc == WEOF)
- + break;
- +
- + field_1_buffer = xrealloc (field_1_buffer, len + mblength);
- + memcpy (field_1_buffer + len, bufpos, mblength);
- + len += mblength;
- + buflen -= mblength;
- + bufpos += mblength;
- +
- + if (!convfail && (wc == line_delim || wc == wcdelim))
- + break;
- + }
- +
- + if (len <= 0 && wc == WEOF)
- + break;
- +
- + /* If the first field extends to the end of line (it is not
- + delimited) and we are printing all non-delimited lines,
- + print this one. */
- + if (convfail || (!convfail && wc != wcdelim))
- + {
- + if (suppress_non_delimited)
- + {
- + /* Empty. */
- + }
- + else
- + {
- + fwrite (field_1_buffer, sizeof (char), len, stdout);
- + /* Make sure the output line is newline terminated. */
- + if (convfail || (!convfail && wc != line_delim))
- + putchar (line_delim);
- + }
- + continue;
- + }
- +
- + if (print_kth (1))
- + {
- + /* Print the field, but not the trailing delimiter. */
- + fwrite (field_1_buffer, sizeof (char), len - 1, stdout);
- + found_any_selected_field = 1;
- + }
- + next_item (&field_idx);
- + }
- +
- + if (wc != WEOF)
- + {
- + if (print_kth (field_idx))
- + {
- + if (found_any_selected_field)
- + {
- + fwrite (output_delimiter_string, sizeof (char),
- + output_delimiter_length, stdout);
- + }
- + found_any_selected_field = 1;
- + }
- +
- + while (1)
- + {
- + REFILL_BUFFER (buf, bufpos, buflen, stream);
- +
- + GET_NEXT_WC_FROM_BUFFER
- + (wc, bufpos, buflen, mblength, state, convfail);
- +
- + if (wc == WEOF)
- + break;
- + else if (!convfail && (wc == wcdelim || wc == line_delim))
- + {
- + buflen -= mblength;
- + bufpos += mblength;
- + break;
- + }
- +
- + if (print_kth (field_idx))
- + fwrite (bufpos, mblength, sizeof(char), stdout);
- +
- + buflen -= mblength;
- + bufpos += mblength;
- + }
- + }
- +
- + if ((!convfail || wc == line_delim) && buflen < 1)
- + wc = WEOF;
- +
- + if (!convfail && wc == wcdelim)
- + next_item (&field_idx);
- + else if (wc == WEOF || (!convfail && wc == line_delim))
- + {
- + if (found_any_selected_field
- + || (!empty_input && !(suppress_non_delimited && field_idx == 1)))
- + putchar (line_delim);
- + if (wc == WEOF)
- + break;
- + field_idx = 1;
- + current_rp = frp;
- + found_any_selected_field = 0;
- + }
- + }
- +}
- +#endif
- +
- +static void
- +cut_stream (FILE *stream)
- +{
- +#if HAVE_MBRTOWC
- + if (MB_CUR_MAX > 1 && !force_singlebyte_mode)
- + {
- + switch (operating_mode)
- + {
- + case byte_mode:
- + if (byte_mode_character_aware)
- + cut_characters_or_cut_bytes_no_split (stream);
- + else
- + cut_bytes (stream);
- + break;
- +
- + case character_mode:
- + cut_characters_or_cut_bytes_no_split (stream);
- + break;
- +
- + case field_mode:
- + if (delimlen == 1)
- + {
- + /* Check if we have utf8 multibyte locale, so we can use this
- + optimization because of uniqueness of characters, which is
- + not true for e.g. SJIS */
- + char * loc = setlocale(LC_CTYPE, NULL);
- + if (loc && (strstr (loc, "UTF-8") || strstr (loc, "utf-8") ||
- + strstr (loc, "UTF8") || strstr (loc, "utf8")))
- + {
- + cut_fields (stream);
- + break;
- + }
- + }
- + cut_fields_mb (stream);
- + break;
- +
- + default:
- + abort ();
- + }
- + }
- + else
- +#endif
- + {
- + if (operating_mode == field_mode)
- + cut_fields (stream);
- + else
- + cut_bytes (stream);
- + }
- +}
- +
- +/* Process file FILE to standard output.
- Return true if successful. */
- static bool
- -cut_file (char const *file, void (*cut_stream) (FILE *))
- +cut_file (char const *file)
- {
- FILE *stream;
- @@ -459,8 +834,8 @@ main (int argc, char **argv)
- int optc;
- bool ok;
- bool delim_specified = false;
- - bool byte_mode = false;
- - char *spec_list_string = NULL;
- + char *spec_list_string IF_LINT ( = NULL);
- + char mbdelim[MB_LEN_MAX + 1];
- initialize_main (&argc, &argv);
- set_program_name (argv[0]);
- @@ -470,6 +845,8 @@ main (int argc, char **argv)
- atexit (close_stdout);
- + operating_mode = undefined_mode;
- +
- /* By default, all non-delimited lines are printed. */
- suppress_non_delimited = false;
- @@ -481,35 +858,77 @@ main (int argc, char **argv)
- switch (optc)
- {
- case 'b':
- - case 'c':
- /* Build the byte list. */
- - byte_mode = true;
- - FALLTHROUGH;
- + if (operating_mode != undefined_mode)
- + FATAL_ERROR (_("only one type of list may be specified"));
- + operating_mode = byte_mode;
- + spec_list_string = optarg;
- + break;
- +
- + case 'c':
- + /* Build the character list. */
- + if (operating_mode != undefined_mode)
- + FATAL_ERROR (_("only one type of list may be specified"));
- + operating_mode = character_mode;
- + spec_list_string = optarg;
- + break;
- +
- case 'f':
- /* Build the field list. */
- - if (spec_list_string)
- - FATAL_ERROR (_("only one list may be specified"));
- + if (operating_mode != undefined_mode)
- + FATAL_ERROR (_("only one type of list may be specified"));
- + operating_mode = field_mode;
- spec_list_string = optarg;
- break;
- case 'd':
- /* New delimiter. */
- /* Interpret -d '' to mean 'use the NUL byte as the delimiter.' */
- - if (optarg[0] != '\0' && optarg[1] != '\0')
- - FATAL_ERROR (_("the delimiter must be a single character"));
- - delim = optarg[0];
- - delim_specified = true;
- + {
- +#if HAVE_MBRTOWC
- + if(MB_CUR_MAX > 1)
- + {
- + mbstate_t state;
- +
- + memset (&state, '\0', sizeof(mbstate_t));
- + delimlen = mbrtowc (&wcdelim, optarg, strnlen(optarg, MB_LEN_MAX), &state);
- +
- + if (delimlen == (size_t)-1 || delimlen == (size_t)-2)
- + ++force_singlebyte_mode;
- + else
- + {
- + delimlen = (delimlen < 1) ? 1 : delimlen;
- + if (wcdelim != L'\0' && *(optarg + delimlen) != '\0')
- + FATAL_ERROR (_("the delimiter must be a single character"));
- + memcpy (mbdelim, optarg, delimlen);
- + mbdelim[delimlen] = '\0';
- + if (delimlen == 1)
- + delim = *optarg;
- + }
- + }
- +
- + if (MB_CUR_MAX <= 1 || force_singlebyte_mode)
- +#endif
- + {
- + if (optarg[0] != '\0' && optarg[1] != '\0')
- + FATAL_ERROR (_("the delimiter must be a single character"));
- + delim = (unsigned char) optarg[0];
- + }
- + delim_specified = true;
- + }
- break;
- case OUTPUT_DELIMITER_OPTION:
- + output_delimiter_specified = true;
- /* Interpret --output-delimiter='' to mean
- 'use the NUL byte as the delimiter.' */
- output_delimiter_length = (optarg[0] == '\0'
- ? 1 : strlen (optarg));
- - output_delimiter_string = optarg;
- + output_delimiter_string = xstrdup (optarg);
- break;
- case 'n':
- + byte_mode_character_aware = 1;
- break;
- case 's':
- @@ -533,40 +952,57 @@ main (int argc, char **argv)
- }
- }
- - if (!spec_list_string)
- + if (operating_mode == undefined_mode)
- FATAL_ERROR (_("you must specify a list of bytes, characters, or fields"));
- - if (byte_mode)
- - {
- - if (delim_specified)
- - FATAL_ERROR (_("an input delimiter may be specified only\
- + if (delim_specified && operating_mode != field_mode)
- + FATAL_ERROR (_("an input delimiter may be specified only\
- when operating on fields"));
- - if (suppress_non_delimited)
- - FATAL_ERROR (_("suppressing non-delimited lines makes sense\n\
- + if (suppress_non_delimited && operating_mode != field_mode)
- + FATAL_ERROR (_("suppressing non-delimited lines makes sense\n\
- \tonly when operating on fields"));
- - }
- set_fields (spec_list_string,
- - ((byte_mode ? SETFLD_ERRMSG_USE_POS : 0)
- - | (complement ? SETFLD_COMPLEMENT : 0)));
- + ( (operating_mode == field_mode) ? 0 : SETFLD_ERRMSG_USE_POS)
- + | (complement ? SETFLD_COMPLEMENT : 0) );
- if (!delim_specified)
- - delim = '\t';
- + {
- + delim = '\t';
- +#ifdef HAVE_MBRTOWC
- + wcdelim = L'\t';
- + mbdelim[0] = '\t';
- + mbdelim[1] = '\0';
- + delimlen = 1;
- +#endif
- + }
- if (output_delimiter_string == NULL)
- {
- - output_delimiter_default[0] = delim;
- - output_delimiter_string = output_delimiter_default;
- - output_delimiter_length = 1;
- +#ifdef HAVE_MBRTOWC
- + if (MB_CUR_MAX > 1 && !force_singlebyte_mode)
- + {
- + output_delimiter_string = xstrdup(mbdelim);
- + output_delimiter_length = delimlen;
- + }
- +
- + if (MB_CUR_MAX <= 1 || force_singlebyte_mode)
- +#endif
- + {
- + static char dummy[2];
- + dummy[0] = delim;
- + dummy[1] = '\0';
- + output_delimiter_string = dummy;
- + output_delimiter_length = 1;
- + }
- }
- - void (*cut_stream) (FILE *) = byte_mode ? cut_bytes : cut_fields;
- if (optind == argc)
- - ok = cut_file ("-", cut_stream);
- + ok = cut_file ("-");
- else
- for (ok = true; optind < argc; optind++)
- - ok &= cut_file (argv[optind], cut_stream);
- + ok &= cut_file (argv[optind]);
- if (have_read_stdin && fclose (stdin) == EOF)
- diff --git a/src/expand-common.c b/src/expand-common.c
- index deec1bd..b39f740 100644
- --- a/src/expand-common.c
- +++ b/src/expand-common.c
- @@ -19,6 +19,7 @@
- #include <assert.h>
- #include <stdio.h>
- #include <sys/types.h>
- +#include <mbfile.h>
- #include "system.h"
- #include "die.h"
- #include "error.h"
- @@ -125,6 +126,119 @@ set_increment_size (uintmax_t tabval)
- return ok;
- }
- +extern int
- +set_utf_locale (void)
- +{
- + /*try using some predefined locale */
- + const char* predef_locales[] = {"C.UTF8","en_US.UTF8","en_GB.UTF8"};
- +
- + const int predef_locales_count=3;
- + for (int i=0;i<predef_locales_count;i++)
- + {
- + if (setlocale(LC_ALL,predef_locales[i])!=NULL)
- + {
- + break;
- + }
- + else if (i==predef_locales_count-1)
- + {
- + return 1;
- + error (EXIT_FAILURE, errno, _("cannot set UTF-8 locale"));
- + }
- + }
- + return 0;
- +}
- +
- +extern bool
- +check_utf_locale(void)
- +{
- + char* locale = setlocale (LC_CTYPE , NULL);
- + if (locale == NULL)
- + {
- + return false;
- + }
- + else if (strcasestr(locale, "utf8") == NULL && strcasestr(locale, "utf-8") == NULL)
- + {
- + return false;
- + }
- + return true;
- +}
- +
- +extern bool
- +check_bom(FILE* fp, mb_file_t *mbf)
- +{
- + int c;
- +
- +
- + c=fgetc(fp);
- +
- + /*test BOM header of the first file */
- + mbf->bufcount=0;
- + if (c == 0xEF)
- + {
- + c=fgetc(fp);
- + }
- + else
- + {
- + if (c != EOF)
- + {
- + ungetc(c,fp);
- + }
- + return false;
- + }
- +
- + if (c == 0xBB)
- + {
- + c=fgetc(fp);
- + }
- + else
- + {
- + if ( c!= EOF )
- + {
- + mbf->buf[0]=(unsigned char) 0xEF;
- + mbf->bufcount=1;
- + ungetc(c,fp);
- + return false;
- + }
- + else
- + {
- + ungetc(0xEF,fp);
- + return false;
- + }
- + }
- + if (c == 0xBF)
- + {
- + mbf->bufcount=0;
- + return true;
- + }
- + else
- + {
- + if (c != EOF)
- + {
- + mbf->buf[0]=(unsigned char) 0xEF;
- + mbf->buf[1]=(unsigned char) 0xBB;
- + mbf->bufcount=2;
- + ungetc(c,fp);
- + return false;
- + }
- + else
- + {
- + mbf->buf[0]=(unsigned char) 0xEF;
- + mbf->bufcount=1;
- + ungetc(0xBB,fp);
- + return false;
- + }
- + }
- + return false;
- +}
- +
- +extern void
- +print_bom(void)
- +{
- + putc (0xEF, stdout);
- + putc (0xBB, stdout);
- + putc (0xBF, stdout);
- +}
- +
- /* Add the comma or blank separated list of tab stops STOPS
- to the list of tab stops. */
- extern void
- diff --git a/src/expand-common.h b/src/expand-common.h
- index 5f59a0e..835b9d5 100644
- --- a/src/expand-common.h
- +++ b/src/expand-common.h
- @@ -25,6 +25,18 @@ extern size_t max_column_width;
- /* The desired exit status. */
- extern int exit_status;
- +extern int
- +set_utf_locale (void);
- +
- +extern bool
- +check_utf_locale(void);
- +
- +extern bool
- +check_bom(FILE* fp, mb_file_t *mbf);
- +
- +extern void
- +print_bom(void);
- +
- /* Add tab stop TABVAL to the end of 'tab_list'. */
- extern void
- add_tab_stop (uintmax_t tabval);
- diff --git a/src/expand.c b/src/expand.c
- index ed78ca8..a4cefa1 100644
- --- a/src/expand.c
- +++ b/src/expand.c
- @@ -37,6 +37,9 @@
- #include <stdio.h>
- #include <getopt.h>
- #include <sys/types.h>
- +
- +#include <mbfile.h>
- +
- #include "system.h"
- #include "die.h"
- @@ -97,19 +100,41 @@ expand (void)
- {
- /* Input stream. */
- FILE *fp = next_file (NULL);
- + mb_file_t mbf;
- + mbf_char_t c;
- + /* True if the starting locale is utf8. */
- + bool using_utf_locale;
- +
- + /* True if the first file contains BOM header. */
- + bool found_bom;
- + using_utf_locale=check_utf_locale();
- if (!fp)
- return;
- + mbf_init (mbf, fp);
- + found_bom=check_bom(fp,&mbf);
- - while (true)
- + if (using_utf_locale == false && found_bom == true)
- + {
- + /*try using some predefined locale */
- +
- + if (set_utf_locale () != 0)
- {
- - /* Input character, or EOF. */
- - int c;
- + error (EXIT_FAILURE, errno, _("cannot set UTF-8 locale"));
- + }
- + }
- +
- +
- + if (found_bom == true)
- + {
- + print_bom();
- + }
- + while (true)
- + {
- /* If true, perform translations. */
- bool convert = true;
- -
- /* The following variables have valid values only when CONVERT
- is true: */
- @@ -119,17 +144,48 @@ expand (void)
- /* Index in TAB_LIST of next tab stop to examine. */
- size_t tab_index = 0;
- -
- /* Convert a line of text. */
- do
- {
- - while ((c = getc (fp)) < 0 && (fp = next_file (fp)))
- - continue;
- + while (true) {
- + mbf_getc (c, mbf);
- + if ((mb_iseof (c)) && (fp = next_file (fp)))
- + {
- + mbf_init (mbf, fp);
- + if (fp!=NULL)
- + {
- + if (check_bom(fp,&mbf)==true)
- + {
- + /*Not the first file - check BOM header*/
- + if (using_utf_locale==false && found_bom==false)
- + {
- + /*BOM header in subsequent file but not in the first one. */
- + error (EXIT_FAILURE, errno, _("combination of files with and without BOM header"));
- + }
- + }
- + else
- + {
- + if(using_utf_locale==false && found_bom==true)
- + {
- + /*First file conatined BOM header - locale was switched to UTF
- + *all subsequent files should contain BOM. */
- + error (EXIT_FAILURE, errno, _("combination of files with and without BOM header"));
- + }
- + }
- + }
- + continue;
- + }
- + else
- + {
- + break;
- + }
- + }
- +
- if (convert)
- {
- - if (c == '\t')
- + if (mb_iseq (c, '\t'))
- {
- /* Column the next input tab stop is on. */
- uintmax_t next_tab_column;
- @@ -148,32 +204,34 @@ expand (void)
- if (putchar (' ') < 0)
- die (EXIT_FAILURE, errno, _("write error"));
- - c = ' ';
- + mb_setascii (&c, ' ');
- }
- - else if (c == '\b')
- + else if (mb_iseq (c, '\b'))
- {
- /* Go back one column, and force recalculation of the
- next tab stop. */
- column -= !!column;
- tab_index -= !!tab_index;
- }
- - else
- + /* A leading control character could make us trip over. */
- + else if (!mb_iscntrl (c))
- {
- - column++;
- + column += mb_width (c);
- if (!column)
- die (EXIT_FAILURE, 0, _("input line is too long"));
- }
- - convert &= convert_entire_line || !! isblank (c);
- + convert &= convert_entire_line || mb_isblank (c);
- }
- - if (c < 0)
- + if (mb_iseof (c))
- return;
- - if (putchar (c) < 0)
- + mb_putc (c, stdout);
- + if (ferror (stdout))
- die (EXIT_FAILURE, errno, _("write error"));
- }
- - while (c != '\n');
- + while (!mb_iseq (c, '\n'));
- }
- }
- diff --git a/src/fold.c b/src/fold.c
- index f07a90b..d32dbfd 100644
- --- a/src/fold.c
- +++ b/src/fold.c
- @@ -22,12 +22,34 @@
- #include <getopt.h>
- #include <sys/types.h>
- +/* Get mbstate_t, mbrtowc(), wcwidth(). */
- +#if HAVE_WCHAR_H
- +# include <wchar.h>
- +#endif
- +
- +/* Get iswprint(), iswblank(), wcwidth(). */
- +#if HAVE_WCTYPE_H
- +# include <wctype.h>
- +#endif
- +
- #include "system.h"
- #include "die.h"
- #include "error.h"
- #include "fadvise.h"
- #include "xdectoint.h"
- +/* MB_LEN_MAX is incorrectly defined to be 1 in at least one GCC
- + installation; work around this configuration error. */
- +#if !defined MB_LEN_MAX || MB_LEN_MAX < 2
- +# undef MB_LEN_MAX
- +# define MB_LEN_MAX 16
- +#endif
- +
- +/* Some systems, like BeOS, have multibyte encodings but lack mbstate_t. */
- +#if HAVE_MBRTOWC && defined mbstate_t
- +# define mbrtowc(pwc, s, n, ps) (mbrtowc) (pwc, s, n, 0)
- +#endif
- +
- #define TAB_WIDTH 8
- /* The official name of this program (e.g., no 'g' prefix). */
- @@ -35,20 +57,41 @@
- #define AUTHORS proper_name ("David MacKenzie")
- +#define FATAL_ERROR(Message) \
- + do \
- + { \
- + error (0, 0, (Message)); \
- + usage (2); \
- + } \
- + while (0)
- +
- +enum operating_mode
- +{
- + /* Fold texts by columns that are at the given positions. */
- + column_mode,
- +
- + /* Fold texts by bytes that are at the given positions. */
- + byte_mode,
- +
- + /* Fold texts by characters that are at the given positions. */
- + character_mode,
- +};
- +
- +/* The argument shows current mode. (Default: column_mode) */
- +static enum operating_mode operating_mode;
- +
- /* If nonzero, try to break on whitespace. */
- static bool break_spaces;
- -/* If nonzero, count bytes, not column positions. */
- -static bool count_bytes;
- -
- /* If nonzero, at least one of the files we read was standard input. */
- static bool have_read_stdin;
- -static char const shortopts[] = "bsw:0::1::2::3::4::5::6::7::8::9::";
- +static char const shortopts[] = "bcsw:0::1::2::3::4::5::6::7::8::9::";
- static struct option const longopts[] =
- {
- {"bytes", no_argument, NULL, 'b'},
- + {"characters", no_argument, NULL, 'c'},
- {"spaces", no_argument, NULL, 's'},
- {"width", required_argument, NULL, 'w'},
- {GETOPT_HELP_OPTION_DECL},
- @@ -76,6 +119,7 @@ Wrap input lines in each FILE, writing to standard output.\n\
- fputs (_("\
- -b, --bytes count bytes rather than columns\n\
- + -c, --characters count characters rather than columns\n\
- -s, --spaces break at spaces\n\
- -w, --width=WIDTH use WIDTH columns instead of 80\n\
- "), stdout);
- @@ -93,7 +137,7 @@ Wrap input lines in each FILE, writing to standard output.\n\
- static size_t
- adjust_column (size_t column, char c)
- {
- - if (!count_bytes)
- + if (operating_mode != byte_mode)
- {
- if (c == '\b')
- {
- @@ -116,30 +160,14 @@ adjust_column (size_t column, char c)
- to stdout, with maximum line length WIDTH.
- Return true if successful. */
- -static bool
- -fold_file (char const *filename, size_t width)
- +static void
- +fold_text (FILE *istream, size_t width, int *saved_errno)
- {
- - FILE *istream;
- int c;
- size_t column = 0; /* Screen column where next char will go. */
- size_t offset_out = 0; /* Index in 'line_out' for next char. */
- static char *line_out = NULL;
- static size_t allocated_out = 0;
- - int saved_errno;
- -
- - if (STREQ (filename, "-"))
- - {
- - istream = stdin;
- - have_read_stdin = true;
- - }
- - else
- - istream = fopen (filename, "r");
- -
- - if (istream == NULL)
- - {
- - error (0, errno, "%s", quotef (filename));
- - return false;
- - }
- fadvise (istream, FADVISE_SEQUENTIAL);
- @@ -169,6 +197,15 @@ fold_file (char const *filename, size_t width)
- bool found_blank = false;
- size_t logical_end = offset_out;
- + /* If LINE_OUT has no wide character,
- + put a new wide character in LINE_OUT
- + if column is bigger than width. */
- + if (offset_out == 0)
- + {
- + line_out[offset_out++] = c;
- + continue;
- + }
- +
- /* Look for the last blank. */
- while (logical_end)
- {
- @@ -215,13 +252,225 @@ fold_file (char const *filename, size_t width)
- line_out[offset_out++] = c;
- }
- - saved_errno = errno;
- + *saved_errno = errno;
- if (!ferror (istream))
- - saved_errno = 0;
- + *saved_errno = 0;
- if (offset_out)
- fwrite (line_out, sizeof (char), (size_t) offset_out, stdout);
- +}
- +
- +#if HAVE_MBRTOWC
- +static void
- +fold_multibyte_text (FILE *istream, size_t width, int *saved_errno)
- +{
- + char buf[MB_LEN_MAX + BUFSIZ]; /* For spooling a read byte sequence. */
- + size_t buflen = 0; /* The length of the byte sequence in buf. */
- + char *bufpos = buf; /* Next read position of BUF. */
- + wint_t wc; /* A gotten wide character. */
- + size_t mblength; /* The byte size of a multibyte character which shows
- + as same character as WC. */
- + mbstate_t state, state_bak; /* State of the stream. */
- + int convfail = 0; /* 1, when conversion is failed. Otherwise 0. */
- +
- + static char *line_out = NULL;
- + size_t offset_out = 0; /* Index in `line_out' for next char. */
- + static size_t allocated_out = 0;
- +
- + int increment;
- + size_t column = 0;
- +
- + size_t last_blank_pos;
- + size_t last_blank_column;
- + int is_blank_seen;
- + int last_blank_increment = 0;
- + int is_bs_following_last_blank;
- + size_t bs_following_last_blank_num;
- + int is_cr_after_last_blank;
- +
- +#define CLEAR_FLAGS \
- + do \
- + { \
- + last_blank_pos = 0; \
- + last_blank_column = 0; \
- + is_blank_seen = 0; \
- + is_bs_following_last_blank = 0; \
- + bs_following_last_blank_num = 0; \
- + is_cr_after_last_blank = 0; \
- + } \
- + while (0)
- +
- +#define START_NEW_LINE \
- + do \
- + { \
- + putchar ('\n'); \
- + column = 0; \
- + offset_out = 0; \
- + CLEAR_FLAGS; \
- + } \
- + while (0)
- +
- + CLEAR_FLAGS;
- + memset (&state, '\0', sizeof(mbstate_t));
- +
- + for (;; bufpos += mblength, buflen -= mblength)
- + {
- + if (buflen < MB_LEN_MAX && !feof (istream) && !ferror (istream))
- + {
- + memmove (buf, bufpos, buflen);
- + buflen += fread (buf + buflen, sizeof(char), BUFSIZ, istream);
- + bufpos = buf;
- + }
- +
- + if (buflen < 1)
- + break;
- +
- + /* Get a wide character. */
- + state_bak = state;
- + mblength = mbrtowc ((wchar_t *)&wc, bufpos, buflen, &state);
- +
- + switch (mblength)
- + {
- + case (size_t)-1:
- + case (size_t)-2:
- + convfail++;
- + state = state_bak;
- + /* Fall through. */
- +
- + case 0:
- + mblength = 1;
- + break;
- + }
- +
- +rescan:
- + if (operating_mode == byte_mode) /* byte mode */
- + increment = mblength;
- + else if (operating_mode == character_mode) /* character mode */
- + increment = 1;
- + else /* column mode */
- + {
- + if (convfail)
- + increment = 1;
- + else
- + {
- + switch (wc)
- + {
- + case L'\n':
- + fwrite (line_out, sizeof(char), offset_out, stdout);
- + START_NEW_LINE;
- + continue;
- +
- + case L'\b':
- + increment = (column > 0) ? -1 : 0;
- + break;
- +
- + case L'\r':
- + increment = -1 * column;
- + break;
- +
- + case L'\t':
- + increment = 8 - column % 8;
- + break;
- +
- + default:
- + increment = wcwidth (wc);
- + increment = (increment < 0) ? 0 : increment;
- + }
- + }
- + }
- +
- + if (column + increment > width && break_spaces && last_blank_pos)
- + {
- + fwrite (line_out, sizeof(char), last_blank_pos, stdout);
- + putchar ('\n');
- +
- + offset_out = offset_out - last_blank_pos;
- + column = column - last_blank_column + ((is_cr_after_last_blank)
- + ? last_blank_increment : bs_following_last_blank_num);
- + memmove (line_out, line_out + last_blank_pos, offset_out);
- + CLEAR_FLAGS;
- + goto rescan;
- + }
- +
- + if (column + increment > width && column != 0)
- + {
- + fwrite (line_out, sizeof(char), offset_out, stdout);
- + START_NEW_LINE;
- + goto rescan;
- + }
- +
- + if (allocated_out < offset_out + mblength)
- + {
- + line_out = X2REALLOC (line_out, &allocated_out);
- + }
- +
- + memcpy (line_out + offset_out, bufpos, mblength);
- + offset_out += mblength;
- + column += increment;
- +
- + if (is_blank_seen && !convfail && wc == L'\r')
- + is_cr_after_last_blank = 1;
- +
- + if (is_bs_following_last_blank && !convfail && wc == L'\b')
- + ++bs_following_last_blank_num;
- + else
- + is_bs_following_last_blank = 0;
- +
- + if (break_spaces && !convfail && iswblank (wc))
- + {
- + last_blank_pos = offset_out;
- + last_blank_column = column;
- + is_blank_seen = 1;
- + last_blank_increment = increment;
- + is_bs_following_last_blank = 1;
- + bs_following_last_blank_num = 0;
- + is_cr_after_last_blank = 0;
- + }
- + }
- +
- + *saved_errno = errno;
- + if (!ferror (istream))
- + *saved_errno = 0;
- +
- + if (offset_out)
- + fwrite (line_out, sizeof (char), (size_t) offset_out, stdout);
- +
- +}
- +#endif
- +
- +/* Fold file FILENAME, or standard input if FILENAME is "-",
- + to stdout, with maximum line length WIDTH.
- + Return 0 if successful, 1 if an error occurs. */
- +
- +static bool
- +fold_file (char const *filename, size_t width)
- +{
- + FILE *istream;
- + int saved_errno;
- +
- + if (STREQ (filename, "-"))
- + {
- + istream = stdin;
- + have_read_stdin = 1;
- + }
- + else
- + istream = fopen (filename, "r");
- +
- + if (istream == NULL)
- + {
- + error (0, errno, "%s", filename);
- + return 1;
- + }
- +
- + /* Define how ISTREAM is being folded. */
- +#if HAVE_MBRTOWC
- + if (MB_CUR_MAX > 1)
- + fold_multibyte_text (istream, width, &saved_errno);
- + else
- +#endif
- + fold_text (istream, width, &saved_errno);
- +
- if (STREQ (filename, "-"))
- clearerr (istream);
- else if (fclose (istream) != 0 && !saved_errno)
- @@ -252,7 +501,8 @@ main (int argc, char **argv)
- atexit (close_stdout);
- - break_spaces = count_bytes = have_read_stdin = false;
- + operating_mode = column_mode;
- + break_spaces = have_read_stdin = false;
- while ((optc = getopt_long (argc, argv, shortopts, longopts, NULL)) != -1)
- {
- @@ -261,7 +511,15 @@ main (int argc, char **argv)
- switch (optc)
- {
- case 'b': /* Count bytes rather than columns. */
- - count_bytes = true;
- + if (operating_mode != column_mode)
- + FATAL_ERROR (_("only one way of folding may be specified"));
- + operating_mode = byte_mode;
- + break;
- +
- + case 'c':
- + if (operating_mode != column_mode)
- + FATAL_ERROR (_("only one way of folding may be specified"));
- + operating_mode = character_mode;
- break;
- case 's': /* Break at word boundaries. */
- diff --git a/src/join.c b/src/join.c
- index f2fd172..6c7d1ed 100644
- --- a/src/join.c
- +++ b/src/join.c
- @@ -22,19 +22,33 @@
- #include <sys/types.h>
- #include <getopt.h>
- +/* Get mbstate_t, mbrtowc(), mbrtowc(), wcwidth(). */
- +#if HAVE_WCHAR_H
- +# include <wchar.h>
- +#endif
- +
- +/* Get iswblank(), towupper. */
- +#if HAVE_WCTYPE_H
- +# include <wctype.h>
- +#endif
- +
- #include "system.h"
- #include "die.h"
- #include "error.h"
- #include "fadvise.h"
- #include "hard-locale.h"
- #include "linebuffer.h"
- -#include "memcasecmp.h"
- #include "quote.h"
- #include "stdio--.h"
- #include "xmemcoll.h"
- #include "xstrtol.h"
- #include "argmatch.h"
- +/* Some systems, like BeOS, have multibyte encodings but lack mbstate_t. */
- +#if HAVE_MBRTOWC && defined mbstate_t
- +# define mbrtowc(pwc, s, n, ps) (mbrtowc) (pwc, s, n, 0)
- +#endif
- +
- /* The official name of this program (e.g., no 'g' prefix). */
- #define PROGRAM_NAME "join"
- @@ -136,10 +150,12 @@ static struct outlist outlist_head;
- /* Last element in 'outlist', where a new element can be added. */
- static struct outlist *outlist_end = &outlist_head;
- -/* Tab character separating fields. If negative, fields are separated
- - by any nonempty string of blanks, otherwise by exactly one
- - tab character whose value (when cast to unsigned char) equals TAB. */
- -static int tab = -1;
- +/* Tab character separating fields. If NULL, fields are separated
- + by any nonempty string of blanks. */
- +static char *tab = NULL;
- +
- +/* The number of bytes used for tab. */
- +static size_t tablen = 0;
- /* If nonzero, check that the input is correctly ordered. */
- static enum
- @@ -280,13 +296,14 @@ xfields (struct line *line)
- if (ptr == lim)
- return;
- - if (0 <= tab && tab != '\n')
- + if (tab != NULL)
- {
- + unsigned char t = tab[0];
- char *sep;
- - for (; (sep = memchr (ptr, tab, lim - ptr)) != NULL; ptr = sep + 1)
- + for (; (sep = memchr (ptr, t, lim - ptr)) != NULL; ptr = sep + 1)
- extract_field (line, ptr, sep - ptr);
- }
- - else if (tab < 0)
- + else
- {
- /* Skip leading blanks before the first field. */
- while (field_sep (*ptr))
- @@ -310,6 +327,147 @@ xfields (struct line *line)
- extract_field (line, ptr, lim - ptr);
- }
- +#if HAVE_MBRTOWC
- +static void
- +xfields_multibyte (struct line *line)
- +{
- + char *ptr = line->buf.buffer;
- + char const *lim = ptr + line->buf.length - 1;
- + wchar_t wc = 0;
- + size_t mblength = 1;
- + mbstate_t state, state_bak;
- +
- + memset (&state, 0, sizeof (mbstate_t));
- +
- + if (ptr >= lim)
- + return;
- +
- + if (tab != NULL)
- + {
- + char *sep = ptr;
- + for (; ptr < lim; ptr = sep + mblength)
- + {
- + sep = ptr;
- + while (sep < lim)
- + {
- + state_bak = state;
- + mblength = mbrtowc (&wc, sep, lim - sep + 1, &state);
- +
- + if (mblength == (size_t)-1 || mblength == (size_t)-2)
- + {
- + mblength = 1;
- + state = state_bak;
- + }
- + mblength = (mblength < 1) ? 1 : mblength;
- +
- + if (mblength == tablen && !memcmp (sep, tab, mblength))
- + break;
- + else
- + {
- + sep += mblength;
- + continue;
- + }
- + }
- +
- + if (sep >= lim)
- + break;
- +
- + extract_field (line, ptr, sep - ptr);
- + }
- + }
- + else
- + {
- + /* Skip leading blanks before the first field. */
- + while(ptr < lim)
- + {
- + state_bak = state;
- + mblength = mbrtowc (&wc, ptr, lim - ptr + 1, &state);
- +
- + if (mblength == (size_t)-1 || mblength == (size_t)-2)
- + {
- + mblength = 1;
- + state = state_bak;
- + break;
- + }
- + mblength = (mblength < 1) ? 1 : mblength;
- +
- + if (!iswblank(wc) && wc != '\n')
- + break;
- + ptr += mblength;
- + }
- +
- + do
- + {
- + char *sep;
- + state_bak = state;
- + mblength = mbrtowc (&wc, ptr, lim - ptr + 1, &state);
- + if (mblength == (size_t)-1 || mblength == (size_t)-2)
- + {
- + mblength = 1;
- + state = state_bak;
- + break;
- + }
- + mblength = (mblength < 1) ? 1 : mblength;
- +
- + sep = ptr + mblength;
- + while (sep < lim)
- + {
- + state_bak = state;
- + mblength = mbrtowc (&wc, sep, lim - sep + 1, &state);
- + if (mblength == (size_t)-1 || mblength == (size_t)-2)
- + {
- + mblength = 1;
- + state = state_bak;
- + break;
- + }
- + mblength = (mblength < 1) ? 1 : mblength;
- +
- + if (iswblank (wc) || wc == '\n')
- + break;
- +
- + sep += mblength;
- + }
- +
- + extract_field (line, ptr, sep - ptr);
- + if (sep >= lim)
- + return;
- +
- + state_bak = state;
- + mblength = mbrtowc (&wc, sep, lim - sep + 1, &state);
- + if (mblength == (size_t)-1 || mblength == (size_t)-2)
- + {
- + mblength = 1;
- + state = state_bak;
- + break;
- + }
- + mblength = (mblength < 1) ? 1 : mblength;
- +
- + ptr = sep + mblength;
- + while (ptr < lim)
- + {
- + state_bak = state;
- + mblength = mbrtowc (&wc, ptr, lim - ptr + 1, &state);
- + if (mblength == (size_t)-1 || mblength == (size_t)-2)
- + {
- + mblength = 1;
- + state = state_bak;
- + break;
- + }
- + mblength = (mblength < 1) ? 1 : mblength;
- +
- + if (!iswblank (wc) && wc != '\n')
- + break;
- +
- + ptr += mblength;
- + }
- + }
- + while (ptr < lim);
- + }
- +
- + extract_field (line, ptr, lim - ptr);
- +}
- +#endif
- +
- static void
- freeline (struct line *line)
- {
- @@ -331,56 +489,133 @@ keycmp (struct line const *line1, struct line const *line2,
- size_t jf_1, size_t jf_2)
- {
- /* Start of field to compare in each file. */
- - char *beg1;
- - char *beg2;
- -
- - size_t len1;
- - size_t len2; /* Length of fields to compare. */
- + char *beg[2];
- + char *copy[2];
- + size_t len[2]; /* Length of fields to compare. */
- int diff;
- + int i, j;
- + int mallocd = 0;
- if (jf_1 < line1->nfields)
- {
- - beg1 = line1->fields[jf_1].beg;
- - len1 = line1->fields[jf_1].len;
- + beg[0] = line1->fields[jf_1].beg;
- + len[0] = line1->fields[jf_1].len;
- }
- else
- {
- - beg1 = NULL;
- - len1 = 0;
- + beg[0] = NULL;
- + len[0] = 0;
- }
- if (jf_2 < line2->nfields)
- {
- - beg2 = line2->fields[jf_2].beg;
- - len2 = line2->fields[jf_2].len;
- + beg[1] = line2->fields[jf_2].beg;
- + len[1] = line2->fields[jf_2].len;
- }
- else
- {
- - beg2 = NULL;
- - len2 = 0;
- + beg[1] = NULL;
- + len[1] = 0;
- }
- - if (len1 == 0)
- - return len2 == 0 ? 0 : -1;
- - if (len2 == 0)
- + if (len[0] == 0)
- + return len[1] == 0 ? 0 : -1;
- + if (len[1] == 0)
- return 1;
- if (ignore_case)
- {
- - /* FIXME: ignore_case does not work with NLS (in particular,
- - with multibyte chars). */
- - diff = memcasecmp (beg1, beg2, MIN (len1, len2));
- +#ifdef HAVE_MBRTOWC
- + if (MB_CUR_MAX > 1)
- + {
- + size_t mblength;
- + wchar_t wc, uwc;
- + mbstate_t state, state_bak;
- +
- + memset (&state, '\0', sizeof (mbstate_t));
- +
- + for (i = 0; i < 2; i++)
- + {
- + mallocd = 1;
- + copy[i] = xmalloc (len[i] + 1);
- + memset (copy[i], '\0',len[i] + 1);
- +
- + for (j = 0; j < MIN (len[0], len[1]);)
- + {
- + state_bak = state;
- + mblength = mbrtowc (&wc, beg[i] + j, len[i] - j, &state);
- +
- + switch (mblength)
- + {
- + case (size_t) -1:
- + case (size_t) -2:
- + state = state_bak;
- + /* Fall through */
- + case 0:
- + mblength = 1;
- + break;
- +
- + default:
- + uwc = towupper (wc);
- +
- + if (uwc != wc)
- + {
- + mbstate_t state_wc;
- + size_t mblen;
- +
- + memset (&state_wc, '\0', sizeof (mbstate_t));
- + mblen = wcrtomb (copy[i] + j, uwc, &state_wc);
- + assert (mblen != (size_t)-1);
- + }
- + else
- + memcpy (copy[i] + j, beg[i] + j, mblength);
- + }
- + j += mblength;
- + }
- + copy[i][j] = '\0';
- + }
- + }
- + else
- +#endif
- + {
- + for (i = 0; i < 2; i++)
- + {
- + mallocd = 1;
- + copy[i] = xmalloc (len[i] + 1);
- +
- + for (j = 0; j < MIN (len[0], len[1]); j++)
- + copy[i][j] = toupper (beg[i][j]);
- +
- + copy[i][j] = '\0';
- + }
- + }
- }
- else
- {
- - if (hard_LC_COLLATE)
- - return xmemcoll (beg1, len1, beg2, len2);
- - diff = memcmp (beg1, beg2, MIN (len1, len2));
- + copy[0] = beg[0];
- + copy[1] = beg[1];
- }
- + if (hard_LC_COLLATE)
- + {
- + diff = xmemcoll ((char *) copy[0], len[0], (char *) copy[1], len[1]);
- +
- + if (mallocd)
- + for (i = 0; i < 2; i++)
- + free (copy[i]);
- +
- + return diff;
- + }
- + diff = memcmp (copy[0], copy[1], MIN (len[0], len[1]));
- +
- + if (mallocd)
- + for (i = 0; i < 2; i++)
- + free (copy[i]);
- +
- +
- if (diff)
- return diff;
- - return len1 < len2 ? -1 : len1 != len2;
- + return len[0] - len[1];
- }
- /* Check that successive input lines PREV and CURRENT from input file
- @@ -472,6 +707,11 @@ get_line (FILE *fp, struct line **linep, int which)
- }
- ++line_no[which - 1];
- +#if HAVE_MBRTOWC
- + if (MB_CUR_MAX > 1)
- + xfields_multibyte (line);
- + else
- +#endif
- xfields (line);
- if (prevline[which - 1])
- @@ -567,21 +807,28 @@ prfield (size_t n, struct line const *line)
- /* Output all the fields in line, other than the join field. */
- +#define PUT_TAB_CHAR \
- + do \
- + { \
- + (tab != NULL) ? \
- + fwrite(tab, sizeof(char), tablen, stdout) : putchar (' '); \
- + } \
- + while (0)
- +
- static void
- prfields (struct line const *line, size_t join_field, size_t autocount)
- {
- size_t i;
- size_t nfields = autoformat ? autocount : line->nfields;
- - char output_separator = tab < 0 ? ' ' : tab;
- for (i = 0; i < join_field && i < nfields; ++i)
- {
- - putchar (output_separator);
- + PUT_TAB_CHAR;
- prfield (i, line);
- }
- for (i = join_field + 1; i < nfields; ++i)
- {
- - putchar (output_separator);
- + PUT_TAB_CHAR;
- prfield (i, line);
- }
- }
- @@ -592,7 +839,6 @@ static void
- prjoin (struct line const *line1, struct line const *line2)
- {
- const struct outlist *outlist;
- - char output_separator = tab < 0 ? ' ' : tab;
- size_t field;
- struct line const *line;
- @@ -626,7 +872,7 @@ prjoin (struct line const *line1, struct line const *line2)
- o = o->next;
- if (o == NULL)
- break;
- - putchar (output_separator);
- + PUT_TAB_CHAR;
- }
- putchar (eolchar);
- }
- @@ -1102,20 +1348,43 @@ main (int argc, char **argv)
- case 't':
- {
- - unsigned char newtab = optarg[0];
- + char *newtab = NULL;
- + size_t newtablen;
- + newtab = xstrdup (optarg);
- +#if HAVE_MBRTOWC
- + if (MB_CUR_MAX > 1)
- + {
- + mbstate_t state;
- +
- + memset (&state, 0, sizeof (mbstate_t));
- + newtablen = mbrtowc (NULL, newtab,
- + strnlen (newtab, MB_LEN_MAX),
- + &state);
- + if (newtablen == (size_t) 0
- + || newtablen == (size_t) -1
- + || newtablen == (size_t) -2)
- + newtablen = 1;
- + }
- + else
- +#endif
- + newtablen = 1;
- if (! newtab)
- - newtab = '\n'; /* '' => process the whole line. */
- + newtab = (char*)"\n"; /* '' => process the whole line. */
- else if (optarg[1])
- {
- - if (STREQ (optarg, "\\0"))
- - newtab = '\0';
- - else
- - die (EXIT_FAILURE, 0, _("multi-character tab %s"),
- - quote (optarg));
- + if (newtablen == 1 && newtab[1])
- + {
- + if (STREQ (newtab, "\\0"))
- + newtab[0] = '\0';
- + }
- + }
- + if (tab != NULL && strcmp (tab, newtab))
- + {
- + free (newtab);
- + die (EXIT_FAILURE, 0, _("incompatible tabs"));
- }
- - if (0 <= tab && tab != newtab)
- - die (EXIT_FAILURE, 0, _("incompatible tabs"));
- tab = newtab;
- + tablen = newtablen;
- }
- break;
- diff --git a/src/local.mk b/src/local.mk
- index e1d15ce..1a5ffaa 100644
- --- a/src/local.mk
- +++ b/src/local.mk
- @@ -434,8 +434,8 @@ src_base32_CPPFLAGS = -DBASE_TYPE=32 $(AM_CPPFLAGS)
- src_basenc_SOURCES = src/basenc.c
- src_basenc_CPPFLAGS = -DBASE_TYPE=42 $(AM_CPPFLAGS)
- -src_expand_SOURCES = src/expand.c src/expand-common.c
- -src_unexpand_SOURCES = src/unexpand.c src/expand-common.c
- +src_expand_SOURCES = src/expand.c src/expand-common.c lib/mbfile.c
- +src_unexpand_SOURCES = src/unexpand.c src/expand-common.c lib/mbfile.c
- src_wc_SOURCES = src/wc.c
- if USE_AVX2_WC_LINECOUNT
- diff --git a/src/pr.c b/src/pr.c
- index 4c17c00..b4fab1c 100644
- --- a/src/pr.c
- +++ b/src/pr.c
- @@ -311,6 +311,24 @@
- #include <getopt.h>
- #include <sys/types.h>
- +
- +/* Get MB_LEN_MAX. */
- +#include <limits.h>
- +/* MB_LEN_MAX is incorrectly defined to be 1 in at least one GCC
- + installation; work around this configuration error. */
- +#if !defined MB_LEN_MAX || MB_LEN_MAX == 1
- +# define MB_LEN_MAX 16
- +#endif
- +
- +/* Get MB_CUR_MAX. */
- +#include <stdlib.h>
- +
- +/* Solaris 2.5 has a bug: <wchar.h> must be included before <wctype.h>. */
- +/* Get mbstate_t, mbrtowc(), wcwidth(). */
- +#if HAVE_WCHAR_H
- +# include <wchar.h>
- +#endif
- +
- #include "system.h"
- #include "die.h"
- #include "error.h"
- @@ -325,6 +343,18 @@
- #include "xstrtol-error.h"
- #include "xdectoint.h"
- +/* Some systems, like BeOS, have multibyte encodings but lack mbstate_t. */
- +#if HAVE_MBRTOWC && defined mbstate_t
- +# define mbrtowc(pwc, s, n, ps) (mbrtowc) (pwc, s, n, 0)
- +#endif
- +
- +#ifndef HAVE_DECL_WCWIDTH
- +"this configure-time declaration test was not run"
- +#endif
- +#if !HAVE_DECL_WCWIDTH
- +extern int wcwidth ();
- +#endif
- +
- /* The official name of this program (e.g., no 'g' prefix). */
- #define PROGRAM_NAME "pr"
- @@ -417,7 +447,20 @@ struct COLUMN
- typedef struct COLUMN COLUMN;
- -static int char_to_clump (char c);
- +/* Funtion pointers to switch functions for single byte locale or for
- + multibyte locale. If multibyte functions do not exist in your sysytem,
- + these pointers always point the function for single byte locale. */
- +static void (*print_char) (char c);
- +static int (*char_to_clump) (char c);
- +
- +/* Functions for single byte locale. */
- +static void print_char_single (char c);
- +static int char_to_clump_single (char c);
- +
- +/* Functions for multibyte locale. */
- +static void print_char_multi (char c);
- +static int char_to_clump_multi (char c);
- +
- static bool read_line (COLUMN *p);
- static bool print_page (void);
- static bool print_stored (COLUMN *p);
- @@ -429,6 +472,7 @@ static void add_line_number (COLUMN *p);
- static void getoptnum (char const *n_str, int min, int *num,
- char const *errfmt);
- static void getoptarg (char *arg, char switch_char, char *character,
- + int *character_length, int *character_width,
- int *number);
- static void print_files (int number_of_files, char **av);
- static void init_parameters (int number_of_files);
- @@ -442,7 +486,6 @@ static void store_char (char c);
- static void pad_down (unsigned int lines);
- static void read_rest_of_line (COLUMN *p);
- static void skip_read (COLUMN *p, int column_number);
- -static void print_char (char c);
- static void cleanup (void);
- static void print_sep_string (void);
- static void separator_string (char const *optarg_S);
- @@ -454,7 +497,7 @@ static COLUMN *column_vector;
- we store the leftmost columns contiguously in buff.
- To print a line from buff, get the index of the first character
- from line_vector[i], and print up to line_vector[i + 1]. */
- -static char *buff;
- +static unsigned char *buff;
- /* Index of the position in buff where the next character
- will be stored. */
- @@ -558,7 +601,7 @@ static int chars_per_column;
- static bool untabify_input = false;
- /* (-e) The input tab character. */
- -static char input_tab_char = '\t';
- +static char input_tab_char[MB_LEN_MAX] = "\t";
- /* (-e) Tabstops are at chars_per_tab, 2*chars_per_tab, 3*chars_per_tab, ...
- where the leftmost column is 1. */
- @@ -568,7 +611,10 @@ static int chars_per_input_tab = 8;
- static bool tabify_output = false;
- /* (-i) The output tab character. */
- -static char output_tab_char = '\t';
- +static char output_tab_char[MB_LEN_MAX] = "\t";
- +
- +/* (-i) The byte length of output tab character. */
- +static int output_tab_char_length = 1;
- /* (-i) The width of the output tab. */
- static int chars_per_output_tab = 8;
- @@ -638,7 +684,13 @@ static int line_number;
- static bool numbered_lines = false;
- /* (-n) Character which follows each line number. */
- -static char number_separator = '\t';
- +static char number_separator[MB_LEN_MAX] = "\t";
- +
- +/* (-n) The byte length of the character which follows each line number. */
- +static int number_separator_length = 1;
- +
- +/* (-n) The character width of the character which follows each line number. */
- +static int number_separator_width = 0;
- /* (-n) line counting starts with 1st line of input file (not with 1st
- line of 1st page printed). */
- @@ -691,6 +743,7 @@ static bool use_col_separator = false;
- -a|COLUMN|-m is a 'space' and with the -J option a 'tab'. */
- static char const *col_sep_string = "";
- static int col_sep_length = 0;
- +static int col_sep_width = 0;
- static char *column_separator = (char *) " ";
- static char *line_separator = (char *) "\t";
- @@ -853,6 +906,13 @@ separator_string (char const *optarg_S)
- integer_overflow ();
- col_sep_length = len;
- col_sep_string = optarg_S;
- +
- +#if HAVE_MBRTOWC
- + if (MB_CUR_MAX > 1)
- + col_sep_width = mbswidth (col_sep_string, 0);
- + else
- +#endif
- + col_sep_width = col_sep_length;
- }
- int
- @@ -877,6 +937,21 @@ main (int argc, char **argv)
- atexit (close_stdout);
- +/* Define which functions are used, the ones for single byte locale or the ones
- + for multibyte locale. */
- +#if HAVE_MBRTOWC
- + if (MB_CUR_MAX > 1)
- + {
- + print_char = print_char_multi;
- + char_to_clump = char_to_clump_multi;
- + }
- + else
- +#endif
- + {
- + print_char = print_char_single;
- + char_to_clump = char_to_clump_single;
- + }
- +
- n_files = 0;
- file_names = (argc > 1
- ? xnmalloc (argc - 1, sizeof (char *))
- @@ -953,8 +1028,12 @@ main (int argc, char **argv)
- break;
- case 'e':
- if (optarg)
- - getoptarg (optarg, 'e', &input_tab_char,
- - &chars_per_input_tab);
- + {
- + int dummy_length, dummy_width;
- +
- + getoptarg (optarg, 'e', input_tab_char, &dummy_length,
- + &dummy_width, &chars_per_input_tab);
- + }
- /* Could check tab width > 0. */
- untabify_input = true;
- break;
- @@ -967,8 +1046,12 @@ main (int argc, char **argv)
- break;
- case 'i':
- if (optarg)
- - getoptarg (optarg, 'i', &output_tab_char,
- - &chars_per_output_tab);
- + {
- + int dummy_width;
- +
- + getoptarg (optarg, 'i', output_tab_char, &output_tab_char_length,
- + &dummy_width, &chars_per_output_tab);
- + }
- /* Could check tab width > 0. */
- tabify_output = true;
- break;
- @@ -986,8 +1069,8 @@ main (int argc, char **argv)
- case 'n':
- numbered_lines = true;
- if (optarg)
- - getoptarg (optarg, 'n', &number_separator,
- - &chars_per_number);
- + getoptarg (optarg, 'n', number_separator, &number_separator_length,
- + &number_separator_width, &chars_per_number);
- break;
- case 'N':
- skip_count = false;
- @@ -1012,6 +1095,7 @@ main (int argc, char **argv)
- /* Reset an additional input of -s, -S dominates -s */
- col_sep_string = "";
- col_sep_length = 0;
- + col_sep_width = 0;
- use_col_separator = true;
- if (optarg)
- separator_string (optarg);
- @@ -1166,10 +1250,45 @@ getoptnum (char const *n_str, int min, int *num, char const *err)
- a number. */
- static void
- -getoptarg (char *arg, char switch_char, char *character, int *number)
- +getoptarg (char *arg, char switch_char, char *character, int *character_length,
- + int *character_width, int *number)
- {
- if (!ISDIGIT (*arg))
- - *character = *arg++;
- + {
- +#ifdef HAVE_MBRTOWC
- + if (MB_CUR_MAX > 1) /* for multibyte locale. */
- + {
- + wchar_t wc;
- + size_t mblength;
- + int width;
- + mbstate_t state = {'\0'};
- +
- + mblength = mbrtowc (&wc, arg, strnlen(arg, MB_LEN_MAX), &state);
- +
- + if (mblength == (size_t)-1 || mblength == (size_t)-2)
- + {
- + *character_length = 1;
- + *character_width = 1;
- + }
- + else
- + {
- + *character_length = (mblength < 1) ? 1 : mblength;
- + width = wcwidth (wc);
- + *character_width = (width < 0) ? 0 : width;
- + }
- +
- + strncpy (character, arg, *character_length);
- + arg += *character_length;
- + }
- + else /* for single byte locale. */
- +#endif
- + {
- + *character = *arg++;
- + *character_length = 1;
- + *character_width = 1;
- + }
- + }
- +
- if (*arg)
- {
- long int tmp_long;
- @@ -1191,6 +1310,11 @@ static void
- init_parameters (int number_of_files)
- {
- int chars_used_by_number = 0;
- + int mb_len = 1;
- +#if HAVE_MBRTOWC
- + if (MB_CUR_MAX > 1)
- + mb_len = MB_LEN_MAX;
- +#endif
- lines_per_body = lines_per_page - lines_per_header - lines_per_footer;
- if (lines_per_body <= 0)
- @@ -1228,7 +1352,7 @@ init_parameters (int number_of_files)
- else
- col_sep_string = column_separator;
- - col_sep_length = 1;
- + col_sep_length = col_sep_width = 1;
- use_col_separator = true;
- }
- /* It's rather pointless to define a TAB separator with column
- @@ -1260,11 +1384,11 @@ init_parameters (int number_of_files)
- + TAB_WIDTH (chars_per_input_tab, chars_per_number); */
- /* Estimate chars_per_text without any margin and keep it constant. */
- - if (number_separator == '\t')
- + if (number_separator[0] == '\t')
- number_width = (chars_per_number
- + TAB_WIDTH (chars_per_default_tab, chars_per_number));
- else
- - number_width = chars_per_number + 1;
- + number_width = chars_per_number + number_separator_width;
- /* The number is part of the column width unless we are
- printing files in parallel. */
- @@ -1273,7 +1397,7 @@ init_parameters (int number_of_files)
- }
- int sep_chars, useful_chars;
- - if (INT_MULTIPLY_WRAPV (columns - 1, col_sep_length, &sep_chars))
- + if (INT_MULTIPLY_WRAPV (columns - 1, col_sep_width, &sep_chars))
- sep_chars = INT_MAX;
- if (INT_SUBTRACT_WRAPV (chars_per_line - chars_used_by_number, sep_chars,
- &useful_chars))
- @@ -1296,7 +1420,7 @@ init_parameters (int number_of_files)
- We've to use 8 as the lower limit, if we use chars_per_default_tab = 8
- to expand a tab which is not an input_tab-char. */
- free (clump_buff);
- - clump_buff = xmalloc (MAX (8, chars_per_input_tab));
- + clump_buff = xmalloc (mb_len * MAX (8, chars_per_input_tab));
- }
- /* Open the necessary files,
- @@ -1402,7 +1526,7 @@ init_funcs (void)
- /* Enlarge p->start_position of first column to use the same form of
- padding_not_printed with all columns. */
- - h = h + col_sep_length;
- + h = h + col_sep_width;
- /* This loop takes care of all but the rightmost column. */
- @@ -1436,7 +1560,7 @@ init_funcs (void)
- }
- else
- {
- - h = h_next + col_sep_length;
- + h = h_next + col_sep_width;
- h_next = h + chars_per_column;
- }
- }
- @@ -1733,9 +1857,9 @@ static void
- align_column (COLUMN *p)
- {
- padding_not_printed = p->start_position;
- - if (col_sep_length < padding_not_printed)
- + if (col_sep_width < padding_not_printed)
- {
- - pad_across_to (padding_not_printed - col_sep_length);
- + pad_across_to (padding_not_printed - col_sep_width);
- padding_not_printed = ANYWHERE;
- }
- @@ -2010,13 +2134,13 @@ store_char (char c)
- /* May be too generous. */
- buff = X2REALLOC (buff, &buff_allocated);
- }
- - buff[buff_current++] = c;
- + buff[buff_current++] = (unsigned char) c;
- }
- static void
- add_line_number (COLUMN *p)
- {
- - int i;
- + int i, j;
- char *s;
- int num_width;
- @@ -2033,22 +2157,24 @@ add_line_number (COLUMN *p)
- /* Tabification is assumed for multiple columns, also for n-separators,
- but 'default n-separator = TAB' hasn't been given priority over
- equal column_width also specified by POSIX. */
- - if (number_separator == '\t')
- + if (number_separator[0] == '\t')
- {
- i = number_width - chars_per_number;
- while (i-- > 0)
- (p->char_func) (' ');
- }
- else
- - (p->char_func) (number_separator);
- + for (j = 0; j < number_separator_length; j++)
- + (p->char_func) (number_separator[j]);
- }
- else
- /* To comply with POSIX, we avoid any expansion of default TAB
- separator with a single column output. No column_width requirement
- has to be considered. */
- {
- - (p->char_func) (number_separator);
- - if (number_separator == '\t')
- + for (j = 0; j < number_separator_length; j++)
- + (p->char_func) (number_separator[j]);
- + if (number_separator[0] == '\t')
- output_position = POS_AFTER_TAB (chars_per_output_tab,
- output_position);
- }
- @@ -2207,7 +2333,7 @@ print_white_space (void)
- while (goal - h_old > 1
- && (h_new = POS_AFTER_TAB (chars_per_output_tab, h_old)) <= goal)
- {
- - putchar (output_tab_char);
- + fwrite (output_tab_char, sizeof(char), output_tab_char_length, stdout);
- h_old = h_new;
- }
- while (++h_old <= goal)
- @@ -2227,6 +2353,7 @@ print_sep_string (void)
- {
- char const *s = col_sep_string;
- int l = col_sep_length;
- + int not_space_flag;
- if (separators_not_printed <= 0)
- {
- @@ -2238,6 +2365,7 @@ print_sep_string (void)
- {
- for (; separators_not_printed > 0; --separators_not_printed)
- {
- + not_space_flag = 0;
- while (l-- > 0)
- {
- /* 3 types of sep_strings: spaces only, spaces and chars,
- @@ -2251,12 +2379,15 @@ print_sep_string (void)
- }
- else
- {
- + not_space_flag = 1;
- if (spaces_not_printed > 0)
- print_white_space ();
- putchar (*s++);
- - ++output_position;
- }
- }
- + if (not_space_flag)
- + output_position += col_sep_width;
- +
- /* sep_string ends with some spaces */
- if (spaces_not_printed > 0)
- print_white_space ();
- @@ -2284,7 +2415,7 @@ print_clump (COLUMN *p, int n, char *clump)
- required number of tabs and spaces. */
- static void
- -print_char (char c)
- +print_char_single (char c)
- {
- if (tabify_output)
- {
- @@ -2308,6 +2439,74 @@ print_char (char c)
- putchar (c);
- }
- +#ifdef HAVE_MBRTOWC
- +static void
- +print_char_multi (char c)
- +{
- + static size_t mbc_pos = 0;
- + static char mbc[MB_LEN_MAX] = {'\0'};
- + static mbstate_t state = {'\0'};
- + mbstate_t state_bak;
- + wchar_t wc;
- + size_t mblength;
- + int width;
- +
- + if (tabify_output)
- + {
- + state_bak = state;
- + mbc[mbc_pos++] = c;
- + mblength = mbrtowc (&wc, mbc, mbc_pos, &state);
- +
- + while (mbc_pos > 0)
- + {
- + switch (mblength)
- + {
- + case (size_t)-2:
- + state = state_bak;
- + return;
- +
- + case (size_t)-1:
- + state = state_bak;
- + ++output_position;
- + putchar (mbc[0]);
- + memmove (mbc, mbc + 1, MB_CUR_MAX - 1);
- + --mbc_pos;
- + break;
- +
- + case 0:
- + mblength = 1;
- +
- + default:
- + if (wc == L' ')
- + {
- + memmove (mbc, mbc + mblength, MB_CUR_MAX - mblength);
- + --mbc_pos;
- + ++spaces_not_printed;
- + return;
- + }
- + else if (spaces_not_printed > 0)
- + print_white_space ();
- +
- + /* Nonprintables are assumed to have width 0, except L'\b'. */
- + if ((width = wcwidth (wc)) < 1)
- + {
- + if (wc == L'\b')
- + --output_position;
- + }
- + else
- + output_position += width;
- +
- + fwrite (mbc, sizeof(char), mblength, stdout);
- + memmove (mbc, mbc + mblength, MB_CUR_MAX - mblength);
- + mbc_pos -= mblength;
- + }
- + }
- + return;
- + }
- + putchar (c);
- +}
- +#endif
- +
- /* Skip to page PAGE before printing.
- PAGE may be larger than total number of pages. */
- @@ -2485,9 +2684,9 @@ read_line (COLUMN *p)
- align_empty_cols = false;
- }
- - if (col_sep_length < padding_not_printed)
- + if (col_sep_width < padding_not_printed)
- {
- - pad_across_to (padding_not_printed - col_sep_length);
- + pad_across_to (padding_not_printed - col_sep_width);
- padding_not_printed = ANYWHERE;
- }
- @@ -2556,7 +2755,7 @@ print_stored (COLUMN *p)
- COLUMN *q;
- int line = p->current_line++;
- - char *first = &buff[line_vector[line]];
- + unsigned char *first = &buff[line_vector[line]];
- /* FIXME
- UMR: Uninitialized memory read:
- * This is occurring while in:
- @@ -2568,7 +2767,7 @@ print_stored (COLUMN *p)
- xmalloc [xmalloc.c:94]
- init_store_cols [pr.c:1648]
- */
- - char *last = &buff[line_vector[line + 1]];
- + unsigned char *last = &buff[line_vector[line + 1]];
- pad_vertically = true;
- @@ -2588,9 +2787,9 @@ print_stored (COLUMN *p)
- }
- }
- - if (col_sep_length < padding_not_printed)
- + if (col_sep_width < padding_not_printed)
- {
- - pad_across_to (padding_not_printed - col_sep_length);
- + pad_across_to (padding_not_printed - col_sep_width);
- padding_not_printed = ANYWHERE;
- }
- @@ -2603,8 +2802,8 @@ print_stored (COLUMN *p)
- if (spaces_not_printed == 0)
- {
- output_position = p->start_position + end_vector[line];
- - if (p->start_position - col_sep_length == chars_per_margin)
- - output_position -= col_sep_length;
- + if (p->start_position - col_sep_width == chars_per_margin)
- + output_position -= col_sep_width;
- }
- return true;
- @@ -2623,7 +2822,7 @@ print_stored (COLUMN *p)
- number of characters is 1.) */
- static int
- -char_to_clump (char c)
- +char_to_clump_single (char c)
- {
- unsigned char uc = c;
- char *s = clump_buff;
- @@ -2633,10 +2832,10 @@ char_to_clump (char c)
- int chars;
- int chars_per_c = 8;
- - if (c == input_tab_char)
- + if (c == input_tab_char[0])
- chars_per_c = chars_per_input_tab;
- - if (c == input_tab_char || c == '\t')
- + if (c == input_tab_char[0] || c == '\t')
- {
- width = TAB_WIDTH (chars_per_c, input_position);
- @@ -2717,6 +2916,164 @@ char_to_clump (char c)
- return chars;
- }
- +#ifdef HAVE_MBRTOWC
- +static int
- +char_to_clump_multi (char c)
- +{
- + static size_t mbc_pos = 0;
- + static char mbc[MB_LEN_MAX] = {'\0'};
- + static mbstate_t state = {'\0'};
- + mbstate_t state_bak;
- + wchar_t wc;
- + size_t mblength;
- + int wc_width;
- + register char *s = clump_buff;
- + register int i, j;
- + char esc_buff[4];
- + int width;
- + int chars;
- + int chars_per_c = 8;
- +
- + state_bak = state;
- + mbc[mbc_pos++] = c;
- + mblength = mbrtowc (&wc, mbc, mbc_pos, &state);
- +
- + width = 0;
- + chars = 0;
- + while (mbc_pos > 0)
- + {
- + switch (mblength)
- + {
- + case (size_t)-2:
- + state = state_bak;
- + return 0;
- +
- + case (size_t)-1:
- + state = state_bak;
- + mblength = 1;
- +
- + if (use_esc_sequence || use_cntrl_prefix)
- + {
- + width = +4;
- + chars = +4;
- + *s++ = '\\';
- + sprintf (esc_buff, "%03o", (unsigned char) mbc[0]);
- + for (i = 0; i <= 2; ++i)
- + *s++ = (int) esc_buff[i];
- + }
- + else
- + {
- + width += 1;
- + chars += 1;
- + *s++ = mbc[0];
- + }
- + break;
- +
- + case 0:
- + mblength = 1;
- + /* Fall through */
- +
- + default:
- + if (memcmp (mbc, input_tab_char, mblength) == 0)
- + chars_per_c = chars_per_input_tab;
- +
- + if (memcmp (mbc, input_tab_char, mblength) == 0 || c == '\t')
- + {
- + int width_inc;
- +
- + width_inc = TAB_WIDTH (chars_per_c, input_position);
- + width += width_inc;
- +
- + if (untabify_input)
- + {
- + for (i = width_inc; i; --i)
- + *s++ = ' ';
- + chars += width_inc;
- + }
- + else
- + {
- + for (i = 0; i < mblength; i++)
- + *s++ = mbc[i];
- + chars += mblength;
- + }
- + }
- + else if ((wc_width = wcwidth (wc)) < 1)
- + {
- + if (use_esc_sequence)
- + {
- + for (i = 0; i < mblength; i++)
- + {
- + width += 4;
- + chars += 4;
- + *s++ = '\\';
- + sprintf (esc_buff, "%03o", (unsigned char) mbc[i]);
- + for (j = 0; j <= 2; ++j)
- + *s++ = (int) esc_buff[j];
- + }
- + }
- + else if (use_cntrl_prefix)
- + {
- + if (wc < 0200)
- + {
- + width += 2;
- + chars += 2;
- + *s++ = '^';
- + *s++ = wc ^ 0100;
- + }
- + else
- + {
- + for (i = 0; i < mblength; i++)
- + {
- + width += 4;
- + chars += 4;
- + *s++ = '\\';
- + sprintf (esc_buff, "%03o", (unsigned char) mbc[i]);
- + for (j = 0; j <= 2; ++j)
- + *s++ = (int) esc_buff[j];
- + }
- + }
- + }
- + else if (wc == L'\b')
- + {
- + width += -1;
- + chars += 1;
- + *s++ = c;
- + }
- + else
- + {
- + width += 0;
- + chars += mblength;
- + for (i = 0; i < mblength; i++)
- + *s++ = mbc[i];
- + }
- + }
- + else
- + {
- + width += wc_width;
- + chars += mblength;
- + for (i = 0; i < mblength; i++)
- + *s++ = mbc[i];
- + }
- + }
- + memmove (mbc, mbc + mblength, MB_CUR_MAX - mblength);
- + mbc_pos -= mblength;
- + }
- +
- + /* Too many backspaces must put us in position 0 -- never negative. */
- + if (width < 0 && input_position == 0)
- + {
- + chars = 0;
- + input_position = 0;
- + }
- + else if (width < 0 && input_position <= -width)
- + input_position = 0;
- + else
- + input_position += width;
- +
- + return chars;
- +}
- +#endif
- +
- /* We've just printed some files and need to clean up things before
- looking for more options and printing the next batch of files.
- diff --git a/src/sort.c b/src/sort.c
- index 3b775d6..a0ba243 100644
- --- a/src/sort.c
- +++ b/src/sort.c
- @@ -29,6 +29,14 @@
- #include <sys/wait.h>
- #include <signal.h>
- #include <assert.h>
- +#if HAVE_WCHAR_H
- +# include <wchar.h>
- +#endif
- +/* Get isw* functions. */
- +#if HAVE_WCTYPE_H
- +# include <wctype.h>
- +#endif
- +
- #include "system.h"
- #include "argmatch.h"
- #include "die.h"
- @@ -159,14 +167,39 @@ static int thousands_sep;
- /* We currently ignore multi-byte grouping chars. */
- static bool thousands_sep_ignored;
- +/* True if -f is specified. */
- +static bool folding;
- +
- /* Nonzero if the corresponding locales are hard. */
- static bool hard_LC_COLLATE;
- -#if HAVE_NL_LANGINFO
- +#if HAVE_LANGINFO_CODESET
- static bool hard_LC_TIME;
- #endif
- #define NONZERO(x) ((x) != 0)
- +/* get a multibyte character's byte length. */
- +#define GET_BYTELEN_OF_CHAR(LIM, PTR, MBLENGTH, STATE) \
- + do \
- + { \
- + wchar_t wc; \
- + mbstate_t state_bak; \
- + \
- + state_bak = STATE; \
- + mblength = mbrtowc (&wc, PTR, LIM - PTR, &STATE); \
- + \
- + switch (MBLENGTH) \
- + { \
- + case (size_t)-1: \
- + case (size_t)-2: \
- + STATE = state_bak; \
- + /* Fall through. */ \
- + case 0: \
- + MBLENGTH = 1; \
- + } \
- + } \
- + while (0)
- +
- /* The kind of blanks for '-b' to skip in various options. */
- enum blanktype { bl_start, bl_end, bl_both };
- @@ -343,13 +376,11 @@ static bool stable;
- /* An int value outside char range. */
- enum { NON_CHAR = CHAR_MAX + 1 };
- -/* If TAB has this value, blanks separate fields. */
- -enum { TAB_DEFAULT = CHAR_MAX + 1 };
- -
- -/* Tab character separating fields. If TAB_DEFAULT, then fields are
- +/* Tab character separating fields. If tab_length is 0, then fields are
- separated by the empty string between a non-blank character and a blank
- character. */
- -static int tab = TAB_DEFAULT;
- +static char tab[MB_LEN_MAX + 1];
- +static size_t tab_length = 0;
- /* Flag to remove consecutive duplicate lines from the output.
- Only the last of a sequence of equal lines will be output. */
- @@ -805,6 +836,46 @@ reap_all (void)
- reap (-1);
- }
- +/* Function pointers. */
- +static void
- +(*inittables) (void);
- +static char *
- +(*begfield) (const struct line*, const struct keyfield *);
- +static char *
- +(*limfield) (const struct line*, const struct keyfield *);
- +static void
- +(*skipblanks) (char **ptr, char *lim);
- +static int
- +(*getmonth) (char const *, size_t, char **);
- +static int
- +(*keycompare) (const struct line *, const struct line *);
- +static int
- +(*numcompare) (const char *, const char *);
- +
- +/* Test for white space multibyte character.
- + Set LENGTH the byte length of investigated multibyte character. */
- +#if HAVE_MBRTOWC
- +static int
- +ismbblank (const char *str, size_t len, size_t *length)
- +{
- + size_t mblength;
- + wchar_t wc;
- + mbstate_t state;
- +
- + memset (&state, '\0', sizeof(mbstate_t));
- + mblength = mbrtowc (&wc, str, len, &state);
- +
- + if (mblength == (size_t)-1 || mblength == (size_t)-2)
- + {
- + *length = 1;
- + return 0;
- + }
- +
- + *length = (mblength < 1) ? 1 : mblength;
- + return iswblank (wc) || wc == '\n';
- +}
- +#endif
- +
- /* Clean up any remaining temporary files. */
- static void
- @@ -1272,7 +1343,7 @@ zaptemp (char const *name)
- free (node);
- }
- -#if HAVE_NL_LANGINFO
- +#if HAVE_LANGINFO_CODESET
- static int
- struct_month_cmp (void const *m1, void const *m2)
- @@ -1287,7 +1358,7 @@ struct_month_cmp (void const *m1, void const *m2)
- /* Initialize the character class tables. */
- static void
- -inittables (void)
- +inittables_uni (void)
- {
- size_t i;
- @@ -1299,7 +1370,7 @@ inittables (void)
- fold_toupper[i] = toupper (i);
- }
- -#if HAVE_NL_LANGINFO
- +#if HAVE_LANGINFO_CODESET
- /* If we're not in the "C" locale, read different names for months. */
- if (hard_LC_TIME)
- {
- @@ -1381,6 +1452,84 @@ specify_nmerge (int oi, char c, char const *s)
- xstrtol_fatal (e, oi, c, long_options, s);
- }
- +#if HAVE_MBRTOWC
- +static void
- +inittables_mb (void)
- +{
- + int i, j, k, l;
- + char *name, *s, *lc_time, *lc_ctype;
- + size_t s_len, mblength;
- + char mbc[MB_LEN_MAX];
- + wchar_t wc, pwc;
- + mbstate_t state_mb, state_wc;
- +
- + lc_time = setlocale (LC_TIME, "");
- + if (lc_time)
- + lc_time = xstrdup (lc_time);
- +
- + lc_ctype = setlocale (LC_CTYPE, "");
- + if (lc_ctype)
- + lc_ctype = xstrdup (lc_ctype);
- +
- + if (lc_time && lc_ctype)
- + /* temporarily set LC_CTYPE to match LC_TIME, so that we can convert
- + * the names of months to upper case */
- + setlocale (LC_CTYPE, lc_time);
- +
- + for (i = 0; i < MONTHS_PER_YEAR; i++)
- + {
- + s = (char *) nl_langinfo (ABMON_1 + i);
- + s_len = strlen (s);
- + monthtab[i].name = name = (char *) xmalloc (s_len + 1);
- + monthtab[i].val = i + 1;
- +
- + memset (&state_mb, '\0', sizeof (mbstate_t));
- + memset (&state_wc, '\0', sizeof (mbstate_t));
- +
- + for (j = 0; j < s_len;)
- + {
- + if (!ismbblank (s + j, s_len - j, &mblength))
- + break;
- + j += mblength;
- + }
- +
- + for (k = 0; j < s_len;)
- + {
- + mblength = mbrtowc (&wc, (s + j), (s_len - j), &state_mb);
- + assert (mblength != (size_t)-1 && mblength != (size_t)-2);
- + if (mblength == 0)
- + break;
- +
- + pwc = towupper (wc);
- + if (pwc == wc)
- + {
- + memcpy (mbc, s + j, mblength);
- + j += mblength;
- + }
- + else
- + {
- + j += mblength;
- + mblength = wcrtomb (mbc, pwc, &state_wc);
- + assert (mblength != (size_t)0 && mblength != (size_t)-1);
- + }
- +
- + for (l = 0; l < mblength; l++)
- + name[k++] = mbc[l];
- + }
- + name[k] = '\0';
- + }
- + qsort ((void *) monthtab, MONTHS_PER_YEAR,
- + sizeof (struct month), struct_month_cmp);
- +
- + if (lc_time && lc_ctype)
- + /* restore the original locales */
- + setlocale (LC_CTYPE, lc_ctype);
- +
- + free (lc_ctype);
- + free (lc_time);
- +}
- +#endif
- +
- /* Specify the amount of main memory to use when sorting. */
- static void
- specify_sort_size (int oi, char c, char const *s)
- @@ -1612,7 +1761,7 @@ buffer_linelim (struct buffer const *buf)
- by KEY in LINE. */
- static char *
- -begfield (struct line const *line, struct keyfield const *key)
- +begfield_uni (const struct line *line, const struct keyfield *key)
- {
- char *ptr = line->text, *lim = ptr + line->length - 1;
- size_t sword = key->sword;
- @@ -1621,10 +1770,10 @@ begfield (struct line const *line, struct keyfield const *key)
- /* The leading field separator itself is included in a field when -t
- is absent. */
- - if (tab != TAB_DEFAULT)
- + if (tab_length)
- while (ptr < lim && sword--)
- {
- - while (ptr < lim && *ptr != tab)
- + while (ptr < lim && *ptr != tab[0])
- ++ptr;
- if (ptr < lim)
- ++ptr;
- @@ -1650,12 +1799,71 @@ begfield (struct line const *line, struct keyfield const *key)
- return ptr;
- }
- +#if HAVE_MBRTOWC
- +static char *
- +begfield_mb (const struct line *line, const struct keyfield *key)
- +{
- + int i;
- + char *ptr = line->text, *lim = ptr + line->length - 1;
- + size_t sword = key->sword;
- + size_t schar = key->schar;
- + size_t mblength;
- + mbstate_t state;
- +
- + memset (&state, '\0', sizeof(mbstate_t));
- +
- + if (tab_length)
- + while (ptr < lim && sword--)
- + {
- + while (ptr < lim && memcmp (ptr, tab, tab_length) != 0)
- + {
- + GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state);
- + ptr += mblength;
- + }
- + if (ptr < lim)
- + {
- + GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state);
- + ptr += mblength;
- + }
- + }
- + else
- + while (ptr < lim && sword--)
- + {
- + while (ptr < lim && ismbblank (ptr, lim - ptr, &mblength))
- + ptr += mblength;
- + if (ptr < lim)
- + {
- + GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state);
- + ptr += mblength;
- + }
- + while (ptr < lim && !ismbblank (ptr, lim - ptr, &mblength))
- + ptr += mblength;
- + }
- +
- + if (key->skipsblanks)
- + while (ptr < lim && ismbblank (ptr, lim - ptr, &mblength))
- + ptr += mblength;
- +
- + for (i = 0; i < schar; i++)
- + {
- + GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state);
- +
- + if (ptr + mblength > lim)
- + break;
- + else
- + ptr += mblength;
- + }
- +
- + return ptr;
- +}
- +#endif
- +
- /* Return the limit of (a pointer to the first character after) the field
- in LINE specified by KEY. */
- ATTRIBUTE_PURE
- static char *
- -limfield (struct line const *line, struct keyfield const *key)
- +limfield_uni (struct line const *line, struct keyfield const *key)
- {
- char *ptr = line->text, *lim = ptr + line->length - 1;
- size_t eword = key->eword, echar = key->echar;
- @@ -1670,10 +1878,10 @@ limfield (struct line const *line, struct keyfield const *key)
- 'beginning' is the first character following the delimiting TAB.
- Otherwise, leave PTR pointing at the first 'blank' character after
- the preceding field. */
- - if (tab != TAB_DEFAULT)
- + if (tab_length)
- while (ptr < lim && eword--)
- {
- - while (ptr < lim && *ptr != tab)
- + while (ptr < lim && *ptr != tab[0])
- ++ptr;
- if (ptr < lim && (eword || echar))
- ++ptr;
- @@ -1719,10 +1927,10 @@ limfield (struct line const *line, struct keyfield const *key)
- */
- /* Make LIM point to the end of (one byte past) the current field. */
- - if (tab != TAB_DEFAULT)
- + if (tab_length)
- {
- char *newlim;
- - newlim = memchr (ptr, tab, lim - ptr);
- + newlim = memchr (ptr, tab[0], lim - ptr);
- if (newlim)
- lim = newlim;
- }
- @@ -1753,6 +1961,130 @@ limfield (struct line const *line, struct keyfield const *key)
- return ptr;
- }
- +#if HAVE_MBRTOWC
- +static char * _GL_ATTRIBUTE_PURE
- +limfield_mb (const struct line *line, const struct keyfield *key)
- +{
- + char *ptr = line->text, *lim = ptr + line->length - 1;
- + size_t eword = key->eword, echar = key->echar;
- + int i;
- + size_t mblength;
- + mbstate_t state;
- +
- + if (echar == 0)
- + eword++; /* skip all of end field. */
- +
- + memset (&state, '\0', sizeof(mbstate_t));
- +
- + if (tab_length)
- + while (ptr < lim && eword--)
- + {
- + while (ptr < lim && memcmp (ptr, tab, tab_length) != 0)
- + {
- + GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state);
- + ptr += mblength;
- + }
- + if (ptr < lim && (eword | echar))
- + {
- + GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state);
- + ptr += mblength;
- + }
- + }
- + else
- + while (ptr < lim && eword--)
- + {
- + while (ptr < lim && ismbblank (ptr, lim - ptr, &mblength))
- + ptr += mblength;
- + if (ptr < lim)
- + {
- + GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state);
- + ptr += mblength;
- + }
- + while (ptr < lim && !ismbblank (ptr, lim - ptr, &mblength))
- + ptr += mblength;
- + }
- +
- +
- +# ifdef POSIX_UNSPECIFIED
- + /* Make LIM point to the end of (one byte past) the current field. */
- + if (tab_length)
- + {
- + char *newlim, *p;
- +
- + newlim = NULL;
- + for (p = ptr; p < lim;)
- + {
- + if (memcmp (p, tab, tab_length) == 0)
- + {
- + newlim = p;
- + break;
- + }
- +
- + GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state);
- + p += mblength;
- + }
- + }
- + else
- + {
- + char *newlim;
- + newlim = ptr;
- +
- + while (newlim < lim && ismbblank (newlim, lim - newlim, &mblength))
- + newlim += mblength;
- + if (ptr < lim)
- + {
- + GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state);
- + ptr += mblength;
- + }
- + while (newlim < lim && !ismbblank (newlim, lim - newlim, &mblength))
- + newlim += mblength;
- + lim = newlim;
- + }
- +# endif
- +
- + if (echar != 0)
- + {
- + /* If we're skipping leading blanks, don't start counting characters
- + * until after skipping past any leading blanks. */
- + if (key->skipeblanks)
- + while (ptr < lim && ismbblank (ptr, lim - ptr, &mblength))
- + ptr += mblength;
- +
- + memset (&state, '\0', sizeof(mbstate_t));
- +
- + /* Advance PTR by ECHAR (if possible), but no further than LIM. */
- + for (i = 0; i < echar; i++)
- + {
- + GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state);
- +
- + if (ptr + mblength > lim)
- + break;
- + else
- + ptr += mblength;
- + }
- + }
- +
- + return ptr;
- +}
- +#endif
- +
- +static void
- +skipblanks_uni (char **ptr, char *lim)
- +{
- + while (*ptr < lim && blanks[to_uchar (**ptr)])
- + ++(*ptr);
- +}
- +
- +#if HAVE_MBRTOWC
- +static void
- +skipblanks_mb (char **ptr, char *lim)
- +{
- + size_t mblength;
- + while (*ptr < lim && ismbblank (*ptr, lim - *ptr, &mblength))
- + (*ptr) += mblength;
- +}
- +#endif
- +
- /* Fill BUF reading from FP, moving buf->left bytes from the end
- of buf->buf to the beginning first. If EOF is reached and the
- file wasn't terminated by a newline, supply one. Set up BUF's line
- @@ -1839,8 +2171,22 @@ fillbuf (struct buffer *buf, FILE *fp, char const *file)
- else
- {
- if (key->skipsblanks)
- - while (blanks[to_uchar (*line_start)])
- - line_start++;
- + {
- +#if HAVE_MBRTOWC
- + if (MB_CUR_MAX > 1)
- + {
- + size_t mblength;
- + while (line_start < line->keylim &&
- + ismbblank (line_start,
- + line->keylim - line_start,
- + &mblength))
- + line_start += mblength;
- + }
- + else
- +#endif
- + while (blanks[to_uchar (*line_start)])
- + line_start++;
- + }
- line->keybeg = line_start;
- }
- }
- @@ -1976,12 +2322,10 @@ find_unit_order (char const *number)
- ATTRIBUTE_PURE
- static int
- -human_numcompare (char const *a, char const *b)
- +human_numcompare (char *a, char *b)
- {
- - while (blanks[to_uchar (*a)])
- - a++;
- - while (blanks[to_uchar (*b)])
- - b++;
- + skipblanks(&a, a + strlen(a));
- + skipblanks(&b, b + strlen(b));
- int diff = find_unit_order (a) - find_unit_order (b);
- return (diff ? diff : strnumcmp (a, b, decimal_point, thousands_sep));
- @@ -1993,7 +2337,7 @@ human_numcompare (char const *a, char const *b)
- ATTRIBUTE_PURE
- static int
- -numcompare (char const *a, char const *b)
- +numcompare_uni (const char *a, const char *b)
- {
- while (blanks[to_uchar (*a)])
- a++;
- @@ -2003,6 +2347,25 @@ numcompare (char const *a, char const *b)
- return strnumcmp (a, b, decimal_point, thousands_sep);
- }
- +#if HAVE_MBRTOWC
- +static int
- +numcompare_mb (const char *a, const char *b)
- +{
- + size_t mblength, len;
- + len = strlen (a); /* okay for UTF-8 */
- + while (*a && ismbblank (a, len > MB_CUR_MAX ? MB_CUR_MAX : len, &mblength))
- + {
- + a += mblength;
- + len -= mblength;
- + }
- + len = strlen (b); /* okay for UTF-8 */
- + while (*b && ismbblank (b, len > MB_CUR_MAX ? MB_CUR_MAX : len, &mblength))
- + b += mblength;
- +
- + return strnumcmp (a, b, decimal_point, thousands_sep);
- +}
- +#endif /* HAV_EMBRTOWC */
- +
- /* Work around a problem whereby the long double value returned by glibc's
- strtold ("NaN", ...) contains uninitialized bits: clear all bytes of
- A and B before calling strtold. FIXME: remove this function if
- @@ -2053,7 +2416,7 @@ general_numcompare (char const *sa, char const *sb)
- Return 0 if the name in S is not recognized. */
- static int
- -getmonth (char const *month, char **ea)
- +getmonth_uni (char const *month, size_t len, char **ea)
- {
- size_t lo = 0;
- size_t hi = MONTHS_PER_YEAR;
- @@ -2329,15 +2692,14 @@ debug_key (struct line const *line, struct keyfield const *key)
- char saved = *lim;
- *lim = '\0';
- - while (blanks[to_uchar (*beg)])
- - beg++;
- + skipblanks (&beg, lim);
- char *tighter_lim = beg;
- if (lim < beg)
- tighter_lim = lim;
- else if (key->month)
- - getmonth (beg, &tighter_lim);
- + getmonth (beg, lim-beg, &tighter_lim);
- else if (key->general_numeric)
- ignore_value (strtold (beg, &tighter_lim));
- else if (key->numeric || key->human_numeric)
- @@ -2483,7 +2845,7 @@ key_warnings (struct keyfield const *gkey, bool gkey_only)
- /* Warn about significant leading blanks. */
- bool implicit_skip = key_numeric (key) || key->month;
- bool line_offset = key->eword == 0 && key->echar != 0; /* -k1.x,1.y */
- - if (!zero_width && !gkey_only && tab == TAB_DEFAULT && !line_offset
- + if (!zero_width && !gkey_only && !tab_length && !line_offset
- && ((!key->skipsblanks && !implicit_skip)
- || (!key->skipsblanks && key->schar)
- || (!key->skipeblanks && key->echar)))
- @@ -2531,9 +2893,9 @@ key_warnings (struct keyfield const *gkey, bool gkey_only)
- bool number_locale_warned = false;
- if (basic_numeric_field_span)
- {
- - if (tab == TAB_DEFAULT
- - ? thousands_sep != NON_CHAR && (isblank (to_uchar (thousands_sep)))
- - : tab == thousands_sep)
- + if (tab_length
- + ? tab[0] == thousands_sep
- + : thousands_sep != NON_CHAR && (isblank (to_uchar (thousands_sep))))
- {
- error (0, 0,
- _("field separator %s is treated as a "
- @@ -2544,9 +2906,9 @@ key_warnings (struct keyfield const *gkey, bool gkey_only)
- }
- if (basic_numeric_field_span || general_numeric_field_span)
- {
- - if (tab == TAB_DEFAULT
- - ? thousands_sep != NON_CHAR && (isblank (to_uchar (decimal_point)))
- - : tab == decimal_point)
- + if (tab_length
- + ? tab[0] == decimal_point
- + : thousands_sep != NON_CHAR && (isblank (to_uchar (decimal_point))))
- {
- error (0, 0,
- _("field separator %s is treated as a "
- @@ -2554,19 +2916,19 @@ key_warnings (struct keyfield const *gkey, bool gkey_only)
- quote (((char []) {decimal_point, 0})));
- number_locale_warned = true;
- }
- - else if (tab == '-')
- + else if (tab_length && tab[0] == '-')
- {
- error (0, 0,
- _("field separator %s is treated as a "
- "minus sign in numbers"),
- - quote (((char []) {tab, 0})));
- + quote (((char []) {tab[0], 0})));
- }
- - else if (general_numeric_field_span && tab == '+')
- + else if (general_numeric_field_span && tab_length && tab[0] == '+')
- {
- error (0, 0,
- _("field separator %s is treated as a "
- "plus sign in numbers"),
- - quote (((char []) {tab, 0})));
- + quote (((char []) {tab[0], 0})));
- }
- }
- @@ -2577,7 +2939,7 @@ key_warnings (struct keyfield const *gkey, bool gkey_only)
- {
- error (0, 0,
- _("%snumbers use %s as a decimal point in this locale"),
- - tab == decimal_point ? "" : _("note "),
- + (tab_length && tab[0] == decimal_point) ? "" : _("note "),
- quote (((char []) {decimal_point, 0})));
- }
- @@ -2610,11 +2972,87 @@ key_warnings (struct keyfield const *gkey, bool gkey_only)
- error (0, 0, _("option '-r' only applies to last-resort comparison"));
- }
- +#if HAVE_MBRTOWC
- +static int
- +getmonth_mb (const char *s, size_t len, char **ea)
- +{
- + char *month;
- + register size_t i;
- + register int lo = 0, hi = MONTHS_PER_YEAR, result;
- + char *tmp;
- + size_t wclength, mblength;
- + const char *pp;
- + const wchar_t *wpp;
- + wchar_t *month_wcs;
- + mbstate_t state;
- +
- + while (len > 0 && ismbblank (s, len, &mblength))
- + {
- + s += mblength;
- + len -= mblength;
- + }
- +
- + if (len == 0)
- + return 0;
- +
- + if (SIZE_MAX - len < 1)
- + xalloc_die ();
- +
- + month = (char *) xnmalloc (len + 1, MB_CUR_MAX);
- +
- + pp = tmp = (char *) xnmalloc (len + 1, MB_CUR_MAX);
- + memcpy (tmp, s, len);
- + tmp[len] = '\0';
- + wpp = month_wcs = (wchar_t *) xnmalloc (len + 1, sizeof (wchar_t));
- + memset (&state, '\0', sizeof (mbstate_t));
- +
- + wclength = mbsrtowcs (month_wcs, &pp, len + 1, &state);
- + if (wclength == (size_t)-1 || pp != NULL)
- + error (SORT_FAILURE, 0, _("Invalid multibyte input %s."), quote(s));
- +
- + for (i = 0; i < wclength; i++)
- + {
- + month_wcs[i] = towupper(month_wcs[i]);
- + if (iswblank (month_wcs[i]))
- + {
- + month_wcs[i] = L'\0';
- + break;
- + }
- + }
- +
- + mblength = wcsrtombs (month, &wpp, (len + 1) * MB_CUR_MAX, &state);
- + assert (mblength != (-1) && wpp == NULL);
- +
- + do
- + {
- + int ix = (lo + hi) / 2;
- +
- + if (strncmp (month, monthtab[ix].name, strlen (monthtab[ix].name)) < 0)
- + hi = ix;
- + else
- + lo = ix;
- + }
- + while (hi - lo > 1);
- +
- + result = (!strncmp (month, monthtab[lo].name, strlen (monthtab[lo].name))
- + ? monthtab[lo].val : 0);
- +
- + if (ea && result)
- + *ea = (char*) s + strlen (monthtab[lo].name);
- +
- + free (month);
- + free (tmp);
- + free (month_wcs);
- +
- + return result;
- +}
- +#endif
- +
- /* Compare two lines A and B trying every key in sequence until there
- are no more keys or a difference is found. */
- static int
- -keycompare (struct line const *a, struct line const *b)
- +keycompare_uni (const struct line *a, const struct line *b)
- {
- struct keyfield *key = keylist;
- @@ -2699,7 +3137,7 @@ keycompare (struct line const *a, struct line const *b)
- else if (key->human_numeric)
- diff = human_numcompare (ta, tb);
- else if (key->month)
- - diff = getmonth (ta, NULL) - getmonth (tb, NULL);
- + diff = getmonth (ta, tlena, NULL) - getmonth (tb, tlenb, NULL);
- else if (key->random)
- diff = compare_random (ta, tlena, tb, tlenb);
- else if (key->version)
- @@ -2815,6 +3253,211 @@ keycompare (struct line const *a, struct line const *b)
- return key->reverse ? -diff : diff;
- }
- +#if HAVE_MBRTOWC
- +static int
- +keycompare_mb (const struct line *a, const struct line *b)
- +{
- + struct keyfield *key = keylist;
- +
- + /* For the first iteration only, the key positions have been
- + precomputed for us. */
- + char *texta = a->keybeg;
- + char *textb = b->keybeg;
- + char *lima = a->keylim;
- + char *limb = b->keylim;
- +
- + size_t mblength_a, mblength_b;
- + wchar_t wc_a, wc_b;
- + mbstate_t state_a, state_b;
- +
- + int diff = 0;
- +
- + memset (&state_a, '\0', sizeof(mbstate_t));
- + memset (&state_b, '\0', sizeof(mbstate_t));
- + /* Ignore keys with start after end. */
- + if (a->keybeg - a->keylim > 0)
- + return 0;
- +
- +
- + /* Ignore and/or translate chars before comparing. */
- +# define IGNORE_CHARS(NEW_LEN, LEN, TEXT, COPY, WC, MBLENGTH, STATE) \
- + do \
- + { \
- + wchar_t uwc; \
- + char mbc[MB_LEN_MAX]; \
- + mbstate_t state_wc; \
- + \
- + for (NEW_LEN = i = 0; i < LEN;) \
- + { \
- + mbstate_t state_bak; \
- + \
- + state_bak = STATE; \
- + MBLENGTH = mbrtowc (&WC, TEXT + i, LEN - i, &STATE); \
- + \
- + if (MBLENGTH == (size_t)-2 || MBLENGTH == (size_t)-1 \
- + || MBLENGTH == 0) \
- + { \
- + if (MBLENGTH == (size_t)-2 || MBLENGTH == (size_t)-1) \
- + STATE = state_bak; \
- + if (!ignore) \
- + COPY[NEW_LEN++] = TEXT[i]; \
- + i++; \
- + continue; \
- + } \
- + \
- + if (ignore) \
- + { \
- + if ((ignore == nonprinting && !iswprint (WC)) \
- + || (ignore == nondictionary \
- + && !iswalnum (WC) && !iswblank (WC))) \
- + { \
- + i += MBLENGTH; \
- + continue; \
- + } \
- + } \
- + \
- + if (translate) \
- + { \
- + \
- + uwc = towupper(WC); \
- + if (WC == uwc) \
- + { \
- + memcpy (mbc, TEXT + i, MBLENGTH); \
- + i += MBLENGTH; \
- + } \
- + else \
- + { \
- + i += MBLENGTH; \
- + WC = uwc; \
- + memset (&state_wc, '\0', sizeof (mbstate_t)); \
- + \
- + MBLENGTH = wcrtomb (mbc, WC, &state_wc); \
- + assert (MBLENGTH != (size_t)-1 && MBLENGTH != 0); \
- + } \
- + \
- + for (j = 0; j < MBLENGTH; j++) \
- + COPY[NEW_LEN++] = mbc[j]; \
- + } \
- + else \
- + for (j = 0; j < MBLENGTH; j++) \
- + COPY[NEW_LEN++] = TEXT[i++]; \
- + } \
- + COPY[NEW_LEN] = '\0'; \
- + } \
- + while (0)
- +
- + /* Actually compare the fields. */
- +
- + for (;;)
- + {
- + /* Find the lengths. */
- + size_t lena = lima <= texta ? 0 : lima - texta;
- + size_t lenb = limb <= textb ? 0 : limb - textb;
- +
- + char enda IF_LINT (= 0);
- + char endb IF_LINT (= 0);
- +
- + char const *translate = key->translate;
- + bool const *ignore = key->ignore;
- +
- + if (ignore || translate)
- + {
- + if (SIZE_MAX - lenb - 2 < lena)
- + xalloc_die ();
- + char *copy_a = (char *) xnmalloc (lena + lenb + 2, MB_CUR_MAX);
- + char *copy_b = copy_a + lena * MB_CUR_MAX + 1;
- + size_t new_len_a, new_len_b;
- + size_t i, j;
- +
- + IGNORE_CHARS (new_len_a, lena, texta, copy_a,
- + wc_a, mblength_a, state_a);
- + IGNORE_CHARS (new_len_b, lenb, textb, copy_b,
- + wc_b, mblength_b, state_b);
- + texta = copy_a; textb = copy_b;
- + lena = new_len_a; lenb = new_len_b;
- + }
- + else
- + {
- + /* Use the keys in-place, temporarily null-terminated. */
- + enda = texta[lena]; texta[lena] = '\0';
- + endb = textb[lenb]; textb[lenb] = '\0';
- + }
- +
- + if (key->random)
- + diff = compare_random (texta, lena, textb, lenb);
- + else if (key->numeric | key->general_numeric | key->human_numeric)
- + {
- + char savea = *lima, saveb = *limb;
- +
- + *lima = *limb = '\0';
- + diff = (key->numeric ? numcompare (texta, textb)
- + : key->general_numeric ? general_numcompare (texta, textb)
- + : human_numcompare (texta, textb));
- + *lima = savea, *limb = saveb;
- + }
- + else if (key->version)
- + diff = filevercmp (texta, textb);
- + else if (key->month)
- + diff = getmonth (texta, lena, NULL) - getmonth (textb, lenb, NULL);
- + else if (lena == 0)
- + diff = - NONZERO (lenb);
- + else if (lenb == 0)
- + diff = 1;
- + else if (hard_LC_COLLATE && !folding)
- + {
- + diff = xmemcoll0 (texta, lena + 1, textb, lenb + 1);
- + }
- + else
- + {
- + diff = memcmp (texta, textb, MIN (lena, lenb));
- + if (diff == 0)
- + diff = lena < lenb ? -1 : lena != lenb;
- + }
- +
- + if (ignore || translate)
- + free (texta);
- + else
- + {
- + texta[lena] = enda;
- + textb[lenb] = endb;
- + }
- +
- + if (diff)
- + goto not_equal;
- +
- + key = key->next;
- + if (! key)
- + break;
- +
- + /* Find the beginning and limit of the next field. */
- + if (key->eword != -1)
- + lima = limfield (a, key), limb = limfield (b, key);
- + else
- + lima = a->text + a->length - 1, limb = b->text + b->length - 1;
- +
- + if (key->sword != -1)
- + texta = begfield (a, key), textb = begfield (b, key);
- + else
- + {
- + texta = a->text, textb = b->text;
- + if (key->skipsblanks)
- + {
- + while (texta < lima && ismbblank (texta, lima - texta, &mblength_a))
- + texta += mblength_a;
- + while (textb < limb && ismbblank (textb, limb - textb, &mblength_b))
- + textb += mblength_b;
- + }
- + }
- + }
- +
- +not_equal:
- + if (key && key->reverse)
- + return -diff;
- + else
- + return diff;
- +}
- +#endif
- +
- /* Compare two lines A and B, returning negative, zero, or positive
- depending on whether A compares less than, equal to, or greater than B. */
- @@ -2842,7 +3485,7 @@ compare (struct line const *a, struct line const *b)
- diff = - NONZERO (blen);
- else if (blen == 0)
- diff = 1;
- - else if (hard_LC_COLLATE)
- + else if (hard_LC_COLLATE && !folding)
- {
- /* xmemcoll0 is a performance enhancement as
- it will not unconditionally write '\0' after the
- @@ -4226,6 +4869,7 @@ set_ordering (char const *s, struct keyfield *key, enum blanktype blanktype)
- break;
- case 'f':
- key->translate = fold_toupper;
- + folding = true;
- break;
- case 'g':
- key->general_numeric = true;
- @@ -4305,7 +4949,7 @@ main (int argc, char **argv)
- initialize_exit_failure (SORT_FAILURE);
- hard_LC_COLLATE = hard_locale (LC_COLLATE);
- -#if HAVE_NL_LANGINFO
- +#if HAVE_LANGINFO_CODESET
- hard_LC_TIME = hard_locale (LC_TIME);
- #endif
- @@ -4328,6 +4972,29 @@ main (int argc, char **argv)
- thousands_sep = NON_CHAR;
- }
- +#if HAVE_MBRTOWC
- + if (MB_CUR_MAX > 1)
- + {
- + inittables = inittables_mb;
- + begfield = begfield_mb;
- + limfield = limfield_mb;
- + skipblanks = skipblanks_mb;
- + getmonth = getmonth_mb;
- + keycompare = keycompare_mb;
- + numcompare = numcompare_mb;
- + }
- + else
- +#endif
- + {
- + inittables = inittables_uni;
- + begfield = begfield_uni;
- + limfield = limfield_uni;
- + skipblanks = skipblanks_uni;
- + getmonth = getmonth_uni;
- + keycompare = keycompare_uni;
- + numcompare = numcompare_uni;
- + }
- +
- have_read_stdin = false;
- inittables ();
- @@ -4602,13 +5269,34 @@ main (int argc, char **argv)
- case 't':
- {
- - char newtab = optarg[0];
- - if (! newtab)
- + char newtab[MB_LEN_MAX + 1];
- + size_t newtab_length = 1;
- + strncpy (newtab, optarg, MB_LEN_MAX);
- + if (! newtab[0])
- die (SORT_FAILURE, 0, _("empty tab"));
- - if (optarg[1])
- +#if HAVE_MBRTOWC
- + if (MB_CUR_MAX > 1)
- + {
- + wchar_t wc;
- + mbstate_t state;
- +
- + memset (&state, '\0', sizeof (mbstate_t));
- + newtab_length = mbrtowc (&wc, newtab, strnlen (newtab,
- + MB_LEN_MAX),
- + &state);
- + switch (newtab_length)
- + {
- + case (size_t) -1:
- + case (size_t) -2:
- + case 0:
- + newtab_length = 1;
- + }
- + }
- +#endif
- + if (newtab_length == 1 && optarg[1])
- {
- if (STREQ (optarg, "\\0"))
- - newtab = '\0';
- + newtab[0] = '\0';
- else
- {
- /* Provoke with 'sort -txx'. Complain about
- @@ -4619,9 +5307,11 @@ main (int argc, char **argv)
- quote (optarg));
- }
- }
- - if (tab != TAB_DEFAULT && tab != newtab)
- + if (tab_length && (tab_length != newtab_length
- + || memcmp (tab, newtab, tab_length) != 0))
- die (SORT_FAILURE, 0, _("incompatible tabs"));
- - tab = newtab;
- + memcpy (tab, newtab, newtab_length);
- + tab_length = newtab_length;
- }
- break;
- diff --git a/src/unexpand.c b/src/unexpand.c
- index 7d6100f..04cd646 100644
- --- a/src/unexpand.c
- +++ b/src/unexpand.c
- @@ -38,6 +38,9 @@
- #include <stdio.h>
- #include <getopt.h>
- #include <sys/types.h>
- +
- +#include <mbfile.h>
- +
- #include "system.h"
- #include "die.h"
- @@ -106,24 +109,47 @@ unexpand (void)
- {
- /* Input stream. */
- FILE *fp = next_file (NULL);
- + mb_file_t mbf;
- /* The array of pending blanks. In non-POSIX locales, blanks can
- include characters other than spaces, so the blanks must be
- stored, not merely counted. */
- - char *pending_blank;
- + mbf_char_t *pending_blank;
- + /* True if the starting locale is utf8. */
- + bool using_utf_locale;
- +
- + /* True if the first file contains BOM header. */
- + bool found_bom;
- + using_utf_locale=check_utf_locale();
- if (!fp)
- return;
- + mbf_init (mbf, fp);
- + found_bom=check_bom(fp,&mbf);
- +
- + if (using_utf_locale == false && found_bom == true)
- + {
- + /*try using some predefined locale */
- + if (set_utf_locale () != 0)
- + {
- + error (EXIT_FAILURE, errno, _("cannot set UTF-8 locale"));
- + }
- + }
- /* The worst case is a non-blank character, then one blank, then a
- tab stop, then MAX_COLUMN_WIDTH - 1 blanks, then a non-blank; so
- allocate MAX_COLUMN_WIDTH bytes to store the blanks. */
- - pending_blank = xmalloc (max_column_width);
- + pending_blank = xmalloc (max_column_width * sizeof (mbf_char_t));
- +
- + if (found_bom == true)
- + {
- + print_bom();
- + }
- while (true)
- {
- /* Input character, or EOF. */
- - int c;
- + mbf_char_t c;
- /* If true, perform translations. */
- bool convert = true;
- @@ -157,12 +183,44 @@ unexpand (void)
- do
- {
- - while ((c = getc (fp)) < 0 && (fp = next_file (fp)))
- - continue;
- + while (true) {
- + mbf_getc (c, mbf);
- + if ((mb_iseof (c)) && (fp = next_file (fp)))
- + {
- + mbf_init (mbf, fp);
- + if (fp!=NULL)
- + {
- + if (check_bom(fp,&mbf)==true)
- + {
- + /*Not the first file - check BOM header*/
- + if (using_utf_locale==false && found_bom==false)
- + {
- + /*BOM header in subsequent file but not in the first one. */
- + error (EXIT_FAILURE, errno, _("combination of files with and without BOM header"));
- + }
- + }
- + else
- + {
- + if(using_utf_locale==false && found_bom==true)
- + {
- + /*First file conatined BOM header - locale was switched to UTF
- + *all subsequent files should contain BOM. */
- + error (EXIT_FAILURE, errno, _("combination of files with and without BOM header"));
- + }
- + }
- + }
- + continue;
- + }
- + else
- + {
- + break;
- + }
- + }
- +
- if (convert)
- {
- - bool blank = !! isblank (c);
- + bool blank = mb_isblank (c);
- if (blank)
- {
- @@ -179,16 +237,16 @@ unexpand (void)
- if (next_tab_column < column)
- die (EXIT_FAILURE, 0, _("input line is too long"));
- - if (c == '\t')
- + if (mb_iseq (c, '\t'))
- {
- column = next_tab_column;
- if (pending)
- - pending_blank[0] = '\t';
- + mb_setascii (&pending_blank[0], '\t');
- }
- else
- {
- - column++;
- + column += mb_width (c);
- if (! (prev_blank && column == next_tab_column))
- {
- @@ -196,13 +254,14 @@ unexpand (void)
- will be replaced by tabs. */
- if (column == next_tab_column)
- one_blank_before_tab_stop = true;
- - pending_blank[pending++] = c;
- + mb_copy (&pending_blank[pending++], &c);
- prev_blank = true;
- continue;
- }
- /* Replace the pending blanks by a tab or two. */
- - pending_blank[0] = c = '\t';
- + mb_setascii (&c, '\t');
- + mb_setascii (&pending_blank[0], '\t');
- }
- /* Discard pending blanks, unless it was a single
- @@ -210,7 +269,7 @@ unexpand (void)
- pending = one_blank_before_tab_stop;
- }
- }
- - else if (c == '\b')
- + else if (mb_iseq (c, '\b'))
- {
- /* Go back one column, and force recalculation of the
- next tab stop. */
- @@ -218,9 +277,11 @@ unexpand (void)
- next_tab_column = column;
- tab_index -= !!tab_index;
- }
- - else
- + else if (!mb_iseq (c, '\n'))
- {
- - column++;
- + /* mb_width() returns 0 for control characters */
- + const int width = mb_width (c);
- + column += MAX(1, width);
- if (!column)
- die (EXIT_FAILURE, 0, _("input line is too long"));
- }
- @@ -228,8 +289,11 @@ unexpand (void)
- if (pending)
- {
- if (pending > 1 && one_blank_before_tab_stop)
- - pending_blank[0] = '\t';
- - if (fwrite (pending_blank, 1, pending, stdout) != pending)
- + mb_setascii (&pending_blank[0], '\t');
- +
- + for (int n = 0; n < pending; ++n)
- + mb_putc (pending_blank[n], stdout);
- + if (ferror (stdout))
- die (EXIT_FAILURE, errno, _("write error"));
- pending = 0;
- one_blank_before_tab_stop = false;
- @@ -239,16 +303,17 @@ unexpand (void)
- convert &= convert_entire_line || blank;
- }
- - if (c < 0)
- + if (mb_iseof (c))
- {
- free (pending_blank);
- return;
- }
- - if (putchar (c) < 0)
- + mb_putc (c, stdout);
- + if (ferror (stdout))
- die (EXIT_FAILURE, errno, _("write error"));
- }
- - while (c != '\n');
- + while (!mb_iseq (c, '\n'));
- }
- }
- diff --git a/src/uniq.c b/src/uniq.c
- index e5996f0..871d47c 100644
- --- a/src/uniq.c
- +++ b/src/uniq.c
- @@ -21,6 +21,17 @@
- #include <getopt.h>
- #include <sys/types.h>
- +/* Get mbstate_t, mbrtowc(). */
- +#if HAVE_WCHAR_H
- +# include <wchar.h>
- +#endif
- +
- +/* Get isw* functions. */
- +#if HAVE_WCTYPE_H
- +# include <wctype.h>
- +#endif
- +#include <assert.h>
- +
- #include "system.h"
- #include "argmatch.h"
- #include "linebuffer.h"
- @@ -33,6 +44,18 @@
- #include "memcasecmp.h"
- #include "quote.h"
- +/* MB_LEN_MAX is incorrectly defined to be 1 in at least one GCC
- + installation; work around this configuration error. */
- +#if !defined MB_LEN_MAX || MB_LEN_MAX < 2
- +# define MB_LEN_MAX 16
- +#endif
- +
- +/* Some systems, like BeOS, have multibyte encodings but lack mbstate_t. */
- +#if HAVE_MBRTOWC && defined mbstate_t
- +# define mbrtowc(pwc, s, n, ps) (mbrtowc) (pwc, s, n, 0)
- +#endif
- +
- +
- /* The official name of this program (e.g., no 'g' prefix). */
- #define PROGRAM_NAME "uniq"
- @@ -139,6 +162,10 @@ enum
- GROUP_OPTION = CHAR_MAX + 1
- };
- +/* Function pointers. */
- +static char *
- +(*find_field) (struct linebuffer *line);
- +
- static struct option const longopts[] =
- {
- {"count", no_argument, NULL, 'c'},
- @@ -254,7 +281,7 @@ size_opt (char const *opt, char const *msgid)
- ATTRIBUTE_PURE
- static char *
- -find_field (struct linebuffer const *line)
- +find_field_uni (struct linebuffer *line)
- {
- size_t count;
- char const *lp = line->buffer;
- @@ -274,6 +301,83 @@ find_field (struct linebuffer const *line)
- return line->buffer + i;
- }
- +#if HAVE_MBRTOWC
- +
- +# define MBCHAR_TO_WCHAR(WC, MBLENGTH, LP, POS, SIZE, STATEP, CONVFAIL) \
- + do \
- + { \
- + mbstate_t state_bak; \
- + \
- + CONVFAIL = 0; \
- + state_bak = *STATEP; \
- + \
- + MBLENGTH = mbrtowc (&WC, LP + POS, SIZE - POS, STATEP); \
- + \
- + switch (MBLENGTH) \
- + { \
- + case (size_t)-2: \
- + case (size_t)-1: \
- + *STATEP = state_bak; \
- + CONVFAIL++; \
- + /* Fall through */ \
- + case 0: \
- + MBLENGTH = 1; \
- + } \
- + } \
- + while (0)
- +
- +static char *
- +find_field_multi (struct linebuffer *line)
- +{
- + size_t count;
- + char *lp = line->buffer;
- + size_t size = line->length - 1;
- + size_t pos;
- + size_t mblength;
- + wchar_t wc;
- + mbstate_t *statep;
- + int convfail = 0;
- +
- + pos = 0;
- + statep = &(line->state);
- +
- + /* skip fields. */
- + for (count = 0; count < skip_fields && pos < size; count++)
- + {
- + while (pos < size)
- + {
- + MBCHAR_TO_WCHAR (wc, mblength, lp, pos, size, statep, convfail);
- +
- + if (convfail || !(iswblank (wc) || wc == '\n'))
- + {
- + pos += mblength;
- + break;
- + }
- + pos += mblength;
- + }
- +
- + while (pos < size)
- + {
- + MBCHAR_TO_WCHAR (wc, mblength, lp, pos, size, statep, convfail);
- +
- + if (!convfail && (iswblank (wc) || wc == '\n'))
- + break;
- +
- + pos += mblength;
- + }
- + }
- +
- + /* skip fields. */
- + for (count = 0; count < skip_chars && pos < size; count++)
- + {
- + MBCHAR_TO_WCHAR (wc, mblength, lp, pos, size, statep, convfail);
- + pos += mblength;
- + }
- +
- + return lp + pos;
- +}
- +#endif
- +
- /* Return false if two strings OLD and NEW match, true if not.
- OLD and NEW point not to the beginnings of the lines
- but rather to the beginnings of the fields to compare.
- @@ -494,6 +598,19 @@ main (int argc, char **argv)
- atexit (close_stdout);
- +#if HAVE_MBRTOWC
- + if (MB_CUR_MAX > 1)
- + {
- + find_field = find_field_multi;
- + }
- + else
- +#endif
- + {
- + find_field = find_field_uni;
- + }
- +
- +
- +
- skip_chars = 0;
- skip_fields = 0;
- check_chars = SIZE_MAX;
- diff --git a/tests/Coreutils.pm b/tests/Coreutils.pm
- index fad7ab9..c9021a6 100644
- --- a/tests/Coreutils.pm
- +++ b/tests/Coreutils.pm
- @@ -269,6 +269,9 @@ sub run_tests ($$$$$)
- # Yes, this is an arbitrary limit. If it causes trouble,
- # consider removing it.
- my $max = 30;
- + # The downstream i18n multi-byte tests have a "-mb" suffix.
- + # Therefore add 3 to the maximum test name length.
- + $max += 3;
- if ($max < length $test_name)
- {
- warn "$program_name: $test_name: test name is too long (> $max)\n";
- diff --git a/tests/expand/mb.sh b/tests/expand/mb.sh
- new file mode 100755
- index 0000000..dd6007c
- --- /dev/null
- +++ b/tests/expand/mb.sh
- @@ -0,0 +1,183 @@
- +#!/bin/sh
- +
- +# Copyright (C) 2012-2015 Free Software Foundation, Inc.
- +
- +# This program is free software: you can redistribute it and/or modify
- +# it under the terms of the GNU General Public License as published by
- +# the Free Software Foundation, either version 3 of the License, or
- +# (at your option) any later version.
- +
- +# This program is distributed in the hope that it will be useful,
- +# but WITHOUT ANY WARRANTY; without even the implied warranty of
- +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- +# GNU General Public License for more details.
- +
- +# You should have received a copy of the GNU General Public License
- +# along with this program. If not, see <http://www.gnu.org/licenses/>.
- +
- +. "${srcdir=.}/tests/init.sh"; path_prepend_ ./src
- +print_ver_ expand
- +
- +export LC_ALL=en_US.UTF-8
- +
- +#input containing multibyte characters
- +cat <<\EOF > in || framework_failure_
- +1234567812345678123456781
- +. . . .
- +a b c d
- +. . . .
- +ä ö ü ß
- +. . . .
- +EOF
- +env printf ' äöü\t. öüä. \tä xx\n' >> in || framework_failure_
- +
- +cat <<\EOF > exp || framework_failure_
- +1234567812345678123456781
- +. . . .
- +a b c d
- +. . . .
- +ä ö ü ß
- +. . . .
- + äöü . öüä. ä xx
- +EOF
- +
- +expand < in > out || fail=1
- +compare exp out > /dev/null 2>&1 || fail=1
- +
- +#multiple files as an input
- +cat <<\EOF >> exp || framework_failure_
- +1234567812345678123456781
- +. . . .
- +a b c d
- +. . . .
- +ä ö ü ß
- +. . . .
- + äöü . öüä. ä xx
- +EOF
- +
- +expand ./in ./in > out || fail=1
- +compare exp out > /dev/null 2>&1 || fail=1
- +
- +#test characters with display widths != 1
- +env printf '12345678
- +e\t|ascii(1)
- +\u00E9\t|composed(1)
- +e\u0301\t|decomposed(1)
- +\u3000\t|ideo-space(2)
- +\uFF0D\t|full-hypen(2)
- +' > in || framework_failure_
- +
- +env printf '12345678
- +e |ascii(1)
- +\u00E9 |composed(1)
- +e\u0301 |decomposed(1)
- +\u3000 |ideo-space(2)
- +\uFF0D |full-hypen(2)
- +' > exp || framework_failure_
- +
- +expand < in > out || fail=1
- +compare exp out > /dev/null 2>&1 || fail=1
- +
- +#shouldn't fail with "input line too long"
- +#when a line starts with a control character
- +env printf '\n' > in || framework_failure_
- +
- +expand < in > out || fail=1
- +compare in out > /dev/null 2>&1 || fail=1
- +
- +#non-Unicode characters interspersed between Unicode ones
- +env printf '12345678
- +\t\xFF|
- +\xFF\t|
- +\t\xFFä|
- +ä\xFF\t|
- +\tä\xFF|
- +\xFF\tä|
- +äbcdef\xFF\t|
- +' > in || framework_failure_
- +
- +env printf '12345678
- + \xFF|
- +\xFF |
- + \xFFä|
- +ä\xFF |
- + ä\xFF|
- +\xFF ä|
- +äbcdef\xFF |
- +' > exp || framework_failure_
- +
- +expand < in > out || fail=1
- +compare exp out > /dev/null 2>&1 || fail=1
- +
- +
- +
- +#BOM header test 1
- +printf "\xEF\xBB\xBF" > in; cat <<\EOF >> in || framework_failure_
- +1234567812345678123456781
- +. . . .
- +a b c d
- +. . . .
- +ä ö ü ß
- +. . . .
- +EOF
- +env printf ' äöü\t. öüä. \tä xx\n' >> in || framework_failure_
- +
- +printf "\xEF\xBB\xBF" > exp; cat <<\EOF >> exp || framework_failure_
- +1234567812345678123456781
- +. . . .
- +a b c d
- +. . . .
- +ä ö ü ß
- +. . . .
- + äöü . öüä. ä xx
- +EOF
- +
- +
- +expand < in > out || fail=1
- +compare exp out > /dev/null 2>&1 || fail=1
- +
- +LANG=C expand < in > out || fail=1
- +compare exp out > /dev/null 2>&1 || fail=1
- +
- +LC_ALL=C expand < in > out || fail=1
- +compare exp out > /dev/null 2>&1 || fail=1
- +
- +
- +printf '\xEF\xBB\xBF' > in1; cat <<\EOF >> in1 || framework_failure_
- +1234567812345678123456781
- +. . . .
- +a b c d
- +. . . .
- +ä ö ü ß
- +. . . .
- +EOF
- +env printf ' äöü\t. öüä. \tä xx\n' >> in1 || framework_failure_
- +
- +
- +printf '\xEF\xBB\xBF' > exp; cat <<\EOF >> exp || framework_failure_
- +1234567812345678123456781
- +. . . .
- +a b c d
- +. . . .
- +ä ö ü ß
- +. . . .
- + äöü . öüä. ä xx
- +1234567812345678123456781
- +. . . .
- +a b c d
- +. . . .
- +ä ö ü ß
- +. . . .
- + äöü . öüä. ä xx
- +EOF
- +
- +expand in1 in1 > out || fail=1
- +compare exp out > /dev/null 2>&1 || fail=1
- +
- +LANG=C expand in1 in1 > out || fail=1
- +compare exp out > /dev/null 2>&1 || fail=1
- +
- +LC_ALL=C expand in1 in1 > out || fail=1
- +compare exp out > /dev/null 2>&1 || fail=1
- +
- +exit $fail
- diff --git a/tests/i18n/sort.sh b/tests/i18n/sort.sh
- new file mode 100755
- index 0000000..26c95de
- --- /dev/null
- +++ b/tests/i18n/sort.sh
- @@ -0,0 +1,29 @@
- +#!/bin/sh
- +# Verify sort's multi-byte support.
- +
- +. "${srcdir=.}/tests/init.sh"; path_prepend_ ./src
- +print_ver_ sort
- +
- +export LC_ALL=en_US.UTF-8
- +locale -k LC_CTYPE | grep -q "charmap.*UTF-8" \
- + || skip_ "No UTF-8 locale available"
- +
- +# Enable heap consistency checkng on older systems
- +export MALLOC_CHECK_=2
- +
- +
- +# check buffer overflow issue due to
- +# expanding multi-byte representation due to case conversion
- +# https://bugzilla.suse.com/show_bug.cgi?id=928749
- +cat <<EOF > exp
- +.
- +ɑ
- +EOF
- +cat <<EOF | sort -f > out || fail=1
- +.
- +ɑ
- +EOF
- +compare exp out || { fail=1; cat out; }
- +
- +
- +Exit $fail
- diff --git a/tests/local.mk b/tests/local.mk
- index 0f77786..dbe1843 100644
- --- a/tests/local.mk
- +++ b/tests/local.mk
- @@ -377,6 +377,8 @@ all_tests = \
- tests/misc/sort-discrim.sh \
- tests/misc/sort-files0-from.pl \
- tests/misc/sort-float.sh \
- + tests/misc/sort-mb-tests.sh \
- + tests/i18n/sort.sh \
- tests/misc/sort-h-thousands-sep.sh \
- tests/misc/sort-merge.pl \
- tests/misc/sort-merge-fdlimit.sh \
- @@ -576,6 +578,7 @@ all_tests = \
- tests/du/threshold.sh \
- tests/du/trailing-slash.sh \
- tests/du/two-args.sh \
- + tests/expand/mb.sh \
- tests/id/gnu-zero-uids.sh \
- tests/id/no-context.sh \
- tests/id/context.sh \
- @@ -727,6 +730,7 @@ all_tests = \
- tests/touch/read-only.sh \
- tests/touch/relative.sh \
- tests/touch/trailing-slash.sh \
- + tests/unexpand/mb.sh \
- $(all_root_tests)
- # See tests/factor/create-test.sh.
- diff --git a/tests/misc/expand.pl b/tests/misc/expand.pl
- index 7a77e6f..27f6652 100755
- --- a/tests/misc/expand.pl
- +++ b/tests/misc/expand.pl
- @@ -27,6 +27,15 @@ my $prog = 'expand';
- # Turn off localization of executable's output.
- @ENV{qw(LANGUAGE LANG LC_ALL)} = ('C') x 3;
- +#comment out next line to disable multibyte tests
- +my $mb_locale = $ENV{LOCALE_FR_UTF8};
- +! defined $mb_locale || $mb_locale eq 'none'
- + and $mb_locale = 'C';
- +
- +my $prog = 'expand';
- +my $try = "Try \`$prog --help' for more information.\n";
- +my $inval = "$prog: invalid byte, character or field list\n$try";
- +
- my @Tests =
- (
- ['t1', '--tabs=3', {IN=>"a\tb"}, {OUT=>"a b"}],
- @@ -168,6 +177,8 @@ my @Tests =
- # Test errors
- + # FIXME: The following tests contain ‘quoting’ specific to LC_MESSAGES
- + # So we force LC_MESSAGES=C to make them pass.
- ['e1', '--tabs="a"', {IN=>''}, {OUT=>''}, {EXIT=>1},
- {ERR => "$prog: tab size contains invalid character(s): 'a'\n"}],
- ['e2', "-t $UINTMAX_OFLOW", {IN=>''}, {OUT=>''}, {EXIT=>1},
- @@ -184,6 +195,37 @@ my @Tests =
- {ERR => "$prog: '/' specifier not at start of number: '/'\n"}],
- );
- +if ($mb_locale ne 'C')
- + {
- + # Duplicate each test vector, appending "-mb" to the test name and
- + # inserting {ENV => "LC_ALL=$mb_locale"} in the copy, so that we
- + # provide coverage for the distro-added multi-byte code paths.
- + my @new;
- + foreach my $t (@Tests)
- + {
- + my @new_t = @$t;
- + my $test_name = shift @new_t;
- +
- + # Depending on whether expand is multi-byte-patched,
- + # it emits different diagnostics:
- + # non-MB: invalid byte or field list
- + # MB: invalid byte, character or field list
- + # Adjust the expected error output accordingly.
- + if (grep {ref $_ eq 'HASH' && exists $_->{ERR} && $_->{ERR} eq $inval}
- + (@new_t))
- + {
- + my $sub = {ERR_SUBST => 's/, character//'};
- + push @new_t, $sub;
- + push @$t, $sub;
- + }
- + push @new, ["$test_name-mb", @new_t, {ENV => "LANG=$mb_locale LC_MESSAGES=C"}];
- + }
- + push @Tests, @new;
- + }
- +
- +
- +@Tests = triple_test \@Tests;
- +
- my $save_temps = $ENV{DEBUG};
- my $verbose = $ENV{VERBOSE};
- diff --git a/tests/misc/fold.pl b/tests/misc/fold.pl
- index 2834f92..bc1616a 100755
- --- a/tests/misc/fold.pl
- +++ b/tests/misc/fold.pl
- @@ -20,9 +20,18 @@ use strict;
- (my $program_name = $0) =~ s|.*/||;
- +my $prog = 'fold';
- +my $try = "Try \`$prog --help' for more information.\n";
- +my $inval = "$prog: invalid byte, character or field list\n$try";
- +
- # Turn off localization of executable's output.
- @ENV{qw(LANGUAGE LANG LC_ALL)} = ('C') x 3;
- +# uncommented to enable multibyte paths
- +my $mb_locale = $ENV{LOCALE_FR_UTF8};
- +! defined $mb_locale || $mb_locale eq 'none'
- + and $mb_locale = 'C';
- +
- my @Tests =
- (
- ['s1', '-w2 -s', {IN=>"a\t"}, {OUT=>"a\n\t"}],
- @@ -31,9 +40,48 @@ my @Tests =
- ['s4', '-w4 -s', {IN=>"abc ef\n"}, {OUT=>"abc \nef\n"}],
- );
- +# Add _POSIX2_VERSION=199209 to the environment of each test
- +# that uses an old-style option like +1.
- +if ($mb_locale ne 'C')
- + {
- + # Duplicate each test vector, appending "-mb" to the test name and
- + # inserting {ENV => "LC_ALL=$mb_locale"} in the copy, so that we
- + # provide coverage for the distro-added multi-byte code paths.
- + my @new;
- + foreach my $t (@Tests)
- + {
- + my @new_t = @$t;
- + my $test_name = shift @new_t;
- +
- + # Depending on whether fold is multi-byte-patched,
- + # it emits different diagnostics:
- + # non-MB: invalid byte or field list
- + # MB: invalid byte, character or field list
- + # Adjust the expected error output accordingly.
- + if (grep {ref $_ eq 'HASH' && exists $_->{ERR} && $_->{ERR} eq $inval}
- + (@new_t))
- + {
- + my $sub = {ERR_SUBST => 's/, character//'};
- + push @new_t, $sub;
- + push @$t, $sub;
- + }
- + push @new, ["$test_name-mb", @new_t, {ENV => "LC_ALL=$mb_locale"}];
- + }
- + push @Tests, @new;
- + }
- +
- +@Tests = triple_test \@Tests;
- +
- +# Remember that triple_test creates from each test with exactly one "IN"
- +# file two more tests (.p and .r suffix on name) corresponding to reading
- +# input from a file and from a pipe. The pipe-reading test would fail
- +# due to a race condition about 1 in 20 times.
- +# Remove the IN_PIPE version of the "output-is-input" test above.
- +# The others aren't susceptible because they have three inputs each.
- +@Tests = grep {$_->[0] ne 'output-is-input.p'} @Tests;
- +
- my $save_temps = $ENV{DEBUG};
- my $verbose = $ENV{VERBOSE};
- -my $prog = 'fold';
- my $fail = run_tests ($program_name, $prog, \@Tests, $save_temps, $verbose);
- exit $fail;
- diff --git a/tests/misc/join.pl b/tests/misc/join.pl
- index 06ad777..be40204 100755
- --- a/tests/misc/join.pl
- +++ b/tests/misc/join.pl
- @@ -25,6 +25,15 @@ my $limits = getlimits ();
- my $prog = 'join';
- +my $try = "Try \`$prog --help' for more information.\n";
- +my $inval = "$prog: invalid byte, character or field list\n$try";
- +
- +my $mb_locale;
- +#Comment out next line to disable multibyte tests
- +$mb_locale = $ENV{LOCALE_FR_UTF8};
- +! defined $mb_locale || $mb_locale eq 'none'
- + and $mb_locale = 'C';
- +
- my $delim = chr 0247;
- sub t_subst ($)
- {
- @@ -333,8 +342,49 @@ foreach my $t (@tv)
- push @Tests, $new_ent;
- }
- +# Add _POSIX2_VERSION=199209 to the environment of each test
- +# that uses an old-style option like +1.
- +if ($mb_locale ne 'C')
- + {
- + # Duplicate each test vector, appending "-mb" to the test name and
- + # inserting {ENV => "LC_ALL=$mb_locale"} in the copy, so that we
- + # provide coverage for the distro-added multi-byte code paths.
- + my @new;
- + foreach my $t (@Tests)
- + {
- + my @new_t = @$t;
- + my $test_name = shift @new_t;
- +
- + # Depending on whether join is multi-byte-patched,
- + # it emits different diagnostics:
- + # non-MB: invalid byte or field list
- + # MB: invalid byte, character or field list
- + # Adjust the expected error output accordingly.
- + if (grep {ref $_ eq 'HASH' && exists $_->{ERR} && $_->{ERR} eq $inval}
- + (@new_t))
- + {
- + my $sub = {ERR_SUBST => 's/, character//'};
- + push @new_t, $sub;
- + push @$t, $sub;
- + }
- + #Adjust the output some error messages including test_name for mb
- + if (grep {ref $_ eq 'HASH' && exists $_->{ERR}}
- + (@new_t))
- + {
- + my $sub2 = {ERR_SUBST => "s/$test_name-mb/$test_name/"};
- + push @new_t, $sub2;
- + push @$t, $sub2;
- + }
- + push @new, ["$test_name-mb", @new_t, {ENV => "LC_ALL=$mb_locale"}];
- + }
- + push @Tests, @new;
- + }
- +
- @Tests = triple_test \@Tests;
- +#skip invalid-j-mb test, it is failing because of the format
- +@Tests = grep {$_->[0] ne 'invalid-j-mb'} @Tests;
- +
- my $save_temps = $ENV{DEBUG};
- my $verbose = $ENV{VERBOSE};
- diff --git a/tests/misc/sort-mb-tests.sh b/tests/misc/sort-mb-tests.sh
- new file mode 100755
- index 0000000..11836ba
- --- /dev/null
- +++ b/tests/misc/sort-mb-tests.sh
- @@ -0,0 +1,45 @@
- +#!/bin/sh
- +# Verify sort's multi-byte support.
- +
- +. "${srcdir=.}/tests/init.sh"; path_prepend_ ./src
- +print_ver_ sort
- +
- +export LC_ALL=en_US.UTF-8
- +locale -k LC_CTYPE | grep -q "charmap.*UTF-8" \
- + || skip_ "No UTF-8 locale available"
- +
- +
- +cat <<EOF > exp
- +Banana@5
- +Apple@10
- +Citrus@20
- +Cherry@30
- +EOF
- +
- +cat <<EOF | sort -t @ -k2 -n > out || fail=1
- +Apple@10
- +Banana@5
- +Citrus@20
- +Cherry@30
- +EOF
- +
- +compare exp out || { fail=1; cat out; }
- +
- +
- +cat <<EOF > exp
- +Citrus@AA20@@5
- +Cherry@AA30@@10
- +Apple@AA10@@20
- +Banana@AA5@@30
- +EOF
- +
- +cat <<EOF | sort -t @ -k4 -n > out || fail=1
- +Apple@AA10@@20
- +Banana@AA5@@30
- +Citrus@AA20@@5
- +Cherry@AA30@@10
- +EOF
- +
- +compare exp out || { fail=1; cat out; }
- +
- +Exit $fail
- diff --git a/tests/misc/sort-merge.pl b/tests/misc/sort-merge.pl
- index 7eb4574..eda884c 100755
- --- a/tests/misc/sort-merge.pl
- +++ b/tests/misc/sort-merge.pl
- @@ -26,6 +26,15 @@ my $prog = 'sort';
- # Turn off localization of executable's output.
- @ENV{qw(LANGUAGE LANG LC_ALL)} = ('C') x 3;
- +my $mb_locale;
- +# uncommented according to upstream commit enabling multibyte paths
- +$mb_locale = $ENV{LOCALE_FR_UTF8};
- +! defined $mb_locale || $mb_locale eq 'none'
- + and $mb_locale = 'C';
- +
- +my $try = "Try \`$prog --help' for more information.\n";
- +my $inval = "$prog: invalid byte, character or field list\n$try";
- +
- # three empty files and one that says 'foo'
- my @inputs = (+(map{{IN=> {"empty$_"=> ''}}}1..3), {IN=> {foo=> "foo\n"}});
- @@ -77,6 +86,39 @@ my @Tests =
- {OUT=>$big_input}],
- );
- +# Add _POSIX2_VERSION=199209 to the environment of each test
- +# that uses an old-style option like +1.
- +if ($mb_locale ne 'C')
- + {
- + # Duplicate each test vector, appending "-mb" to the test name and
- + # inserting {ENV => "LC_ALL=$mb_locale"} in the copy, so that we
- + # provide coverage for the distro-added multi-byte code paths.
- + my @new;
- + foreach my $t (@Tests)
- + {
- + my @new_t = @$t;
- + my $test_name = shift @new_t;
- +
- + # Depending on whether sort is multi-byte-patched,
- + # it emits different diagnostics:
- + # non-MB: invalid byte or field list
- + # MB: invalid byte, character or field list
- + # Adjust the expected error output accordingly.
- + if (grep {ref $_ eq 'HASH' && exists $_->{ERR} && $_->{ERR} eq $inval}
- + (@new_t))
- + {
- + my $sub = {ERR_SUBST => 's/, character//'};
- + push @new_t, $sub;
- + push @$t, $sub;
- + }
- + next if ($test_name =~ "nmerge-.");
- + push @new, ["$test_name-mb", @new_t, {ENV => "LC_ALL=$mb_locale"}];
- + }
- + push @Tests, @new;
- + }
- +
- +@Tests = triple_test \@Tests;
- +
- my $save_temps = $ENV{DEBUG};
- my $verbose = $ENV{VERBOSE};
- diff --git a/tests/misc/sort.pl b/tests/misc/sort.pl
- index 0b0adca..fd27821 100755
- --- a/tests/misc/sort.pl
- +++ b/tests/misc/sort.pl
- @@ -24,10 +24,15 @@ my $prog = 'sort';
- # Turn off localization of executable's output.
- @ENV{qw(LANGUAGE LANG LC_ALL)} = ('C') x 3;
- -my $mb_locale = $ENV{LOCALE_FR_UTF8};
- +my $mb_locale;
- +#Comment out next line to disable multibyte tests
- +$mb_locale = $ENV{LOCALE_FR_UTF8};
- ! defined $mb_locale || $mb_locale eq 'none'
- and $mb_locale = 'C';
- +my $try = "Try \`$prog --help' for more information.\n";
- +my $inval = "$prog: invalid byte, character or field list\n$try";
- +
- # Since each test is run with a file name and with redirected stdin,
- # the name in the diagnostic is either the file name or "-".
- # Normalize each diagnostic to use '-'.
- @@ -423,6 +428,38 @@ foreach my $t (@Tests)
- }
- }
- +if ($mb_locale ne 'C')
- + {
- + # Duplicate each test vector, appending "-mb" to the test name and
- + # inserting {ENV => "LC_ALL=$mb_locale"} in the copy, so that we
- + # provide coverage for the distro-added multi-byte code paths.
- + my @new;
- + foreach my $t (@Tests)
- + {
- + my @new_t = @$t;
- + my $test_name = shift @new_t;
- +
- + # Depending on whether sort is multi-byte-patched,
- + # it emits different diagnostics:
- + # non-MB: invalid byte or field list
- + # MB: invalid byte, character or field list
- + # Adjust the expected error output accordingly.
- + if (grep {ref $_ eq 'HASH' && exists $_->{ERR} && $_->{ERR} eq $inval}
- + (@new_t))
- + {
- + my $sub = {ERR_SUBST => 's/, character//'};
- + push @new_t, $sub;
- + push @$t, $sub;
- + }
- + #disable several failing tests until investigation, disable all tests with envvars set
- + next if (grep {ref $_ eq 'HASH' && exists $_->{ENV}} (@new_t));
- + next if ($test_name =~ "18g" or $test_name =~ "sort-numeric" or $test_name =~ "08[ab]" or $test_name =~ "03[def]" or $test_name =~ "h4" or $test_name =~ "n1" or $test_name =~ "2[01]a");
- + next if ($test_name =~ "11[ab]"); # avoid FP: expected result differs to MB result due to collation rules.
- + push @new, ["$test_name-mb", @new_t, {ENV => "LC_ALL=$mb_locale"}];
- + }
- + push @Tests, @new;
- + }
- +
- @Tests = triple_test \@Tests;
- # Remember that triple_test creates from each test with exactly one "IN"
- @@ -432,6 +469,7 @@ foreach my $t (@Tests)
- # Remove the IN_PIPE version of the "output-is-input" test above.
- # The others aren't susceptible because they have three inputs each.
- @Tests = grep {$_->[0] ne 'output-is-input.p'} @Tests;
- +@Tests = grep {$_->[0] ne 'output-is-input-mb.p'} @Tests;
- my $save_temps = $ENV{DEBUG};
- my $verbose = $ENV{VERBOSE};
- diff --git a/tests/misc/unexpand.pl b/tests/misc/unexpand.pl
- index 2e1906f..fe66012 100755
- --- a/tests/misc/unexpand.pl
- +++ b/tests/misc/unexpand.pl
- @@ -27,6 +27,14 @@ my $limits = getlimits ();
- my $prog = 'unexpand';
- +# comment out next line to disable multibyte tests
- +my $mb_locale = $ENV{LOCALE_FR_UTF8};
- +! defined $mb_locale || $mb_locale eq 'none'
- + and $mb_locale = 'C';
- +
- +my $try = "Try \`$prog --help' for more information.\n";
- +my $inval = "$prog: invalid byte, character or field list\n$try";
- +
- my @Tests =
- (
- ['a1', {IN=> ' 'x 1 ."y\n"}, {OUT=> ' 'x 1 ."y\n"}],
- @@ -128,6 +136,37 @@ my @Tests =
- ['ts2', '-t5,8', {IN=>"x\t \t y\n"}, {OUT=>"x\t\t y\n"}],
- );
- +if ($mb_locale ne 'C')
- + {
- + # Duplicate each test vector, appending "-mb" to the test name and
- + # inserting {ENV => "LC_ALL=$mb_locale"} in the copy, so that we
- + # provide coverage for the distro-added multi-byte code paths.
- + my @new;
- + foreach my $t (@Tests)
- + {
- + my @new_t = @$t;
- + my $test_name = shift @new_t;
- +
- + # Depending on whether unexpand is multi-byte-patched,
- + # it emits different diagnostics:
- + # non-MB: invalid byte or field list
- + # MB: invalid byte, character or field list
- + # Adjust the expected error output accordingly.
- + if (grep {ref $_ eq 'HASH' && exists $_->{ERR} && $_->{ERR} eq $inval}
- + (@new_t))
- + {
- + my $sub = {ERR_SUBST => 's/, character//'};
- + push @new_t, $sub;
- + push @$t, $sub;
- + }
- + next if ($test_name =~ 'b-1');
- + push @new, ["$test_name-mb", @new_t, {ENV => "LC_ALL=$mb_locale"}];
- + }
- + push @Tests, @new;
- + }
- +
- +@Tests = triple_test \@Tests;
- +
- my $save_temps = $ENV{DEBUG};
- my $verbose = $ENV{VERBOSE};
- diff --git a/tests/misc/uniq.pl b/tests/misc/uniq.pl
- index aa163cd..91d617d 100755
- --- a/tests/misc/uniq.pl
- +++ b/tests/misc/uniq.pl
- @@ -23,9 +23,17 @@ my $limits = getlimits ();
- my $prog = 'uniq';
- my $try = "Try '$prog --help' for more information.\n";
- +my $inval = "$prog: invalid byte, character or field list\n$try";
- +
- # Turn off localization of executable's output.
- @ENV{qw(LANGUAGE LANG LC_ALL)} = ('C') x 3;
- +my $mb_locale;
- +#Comment out next line to disable multibyte tests
- +$mb_locale = $ENV{LOCALE_FR_UTF8};
- +! defined $mb_locale || $mb_locale eq 'none'
- + and $mb_locale = 'C';
- +
- # When possible, create a "-z"-testing variant of each test.
- sub add_z_variants($)
- {
- @@ -262,6 +270,53 @@ foreach my $t (@Tests)
- and push @$t, {ENV=>'_POSIX2_VERSION=199209'};
- }
- +if ($mb_locale ne 'C')
- + {
- + # Duplicate each test vector, appending "-mb" to the test name and
- + # inserting {ENV => "LC_ALL=$mb_locale"} in the copy, so that we
- + # provide coverage for the distro-added multi-byte code paths.
- + my @new;
- + foreach my $t (@Tests)
- + {
- + my @new_t = @$t;
- + my $test_name = shift @new_t;
- +
- + # Depending on whether uniq is multi-byte-patched,
- + # it emits different diagnostics:
- + # non-MB: invalid byte or field list
- + # MB: invalid byte, character or field list
- + # Adjust the expected error output accordingly.
- + if (grep {ref $_ eq 'HASH' && exists $_->{ERR} && $_->{ERR} eq $inval}
- + (@new_t))
- + {
- + my $sub = {ERR_SUBST => 's/, character//'};
- + push @new_t, $sub;
- + push @$t, $sub;
- + }
- + # In test #145, replace the each ‘...’ by '...'.
- + if ($test_name =~ "145")
- + {
- + my $sub = { ERR_SUBST => "s/‘([^’]+)’/'\$1'/g"};
- + push @new_t, $sub;
- + push @$t, $sub;
- + }
- + next if ( $test_name =~ "schar"
- + or $test_name =~ "^obs-plus"
- + or $test_name =~ "119");
- + push @new, ["$test_name-mb", @new_t, {ENV => "LC_ALL=$mb_locale"}];
- + }
- + push @Tests, @new;
- + }
- +
- +# Remember that triple_test creates from each test with exactly one "IN"
- +# file two more tests (.p and .r suffix on name) corresponding to reading
- +# input from a file and from a pipe. The pipe-reading test would fail
- +# due to a race condition about 1 in 20 times.
- +# Remove the IN_PIPE version of the "output-is-input" test above.
- +# The others aren't susceptible because they have three inputs each.
- +
- +@Tests = grep {$_->[0] ne 'output-is-input.p'} @Tests;
- +
- @Tests = add_z_variants \@Tests;
- @Tests = triple_test \@Tests;
- diff --git a/tests/pr/pr-tests.pl b/tests/pr/pr-tests.pl
- index 7ac6d4c..ae6cc35 100755
- --- a/tests/pr/pr-tests.pl
- +++ b/tests/pr/pr-tests.pl
- @@ -24,6 +24,15 @@ use strict;
- my $prog = 'pr';
- my $normalize_strerror = "s/': .*/'/";
- +my $mb_locale;
- +#Uncomment the following line to enable multibyte tests
- +$mb_locale = $ENV{LOCALE_FR_UTF8};
- +! defined $mb_locale || $mb_locale eq 'none'
- + and $mb_locale = 'C';
- +
- +my $try = "Try \`$prog --help' for more information.\n";
- +my $inval = "$prog: invalid byte, character or field list\n$try";
- +
- my @tv = (
- # -b option is no longer an official option. But it's still working to
- @@ -512,8 +521,48 @@ push @Tests,
- {IN=>"x\tx\tx\tx\tx\nx\tx\tx\tx\tx\n"},
- {OUT=>"x\tx\tx\tx\tx\tx\tx\tx\tx\tx\n"} ];
- +# Add _POSIX2_VERSION=199209 to the environment of each test
- +# that uses an old-style option like +1.
- +if ($mb_locale ne 'C')
- + {
- + # Duplicate each test vector, appending "-mb" to the test name and
- + # inserting {ENV => "LC_ALL=$mb_locale"} in the copy, so that we
- + # provide coverage for the distro-added multi-byte code paths.
- + my @new;
- + foreach my $t (@Tests)
- + {
- + my @new_t = @$t;
- + my $test_name = shift @new_t;
- +
- + # Depending on whether pr is multi-byte-patched,
- + # it emits different diagnostics:
- + # non-MB: invalid byte or field list
- + # MB: invalid byte, character or field list
- + # Adjust the expected error output accordingly.
- + if (grep {ref $_ eq 'HASH' && exists $_->{ERR} && $_->{ERR} eq $inval}
- + (@new_t))
- + {
- + my $sub = {ERR_SUBST => 's/, character//'};
- + push @new_t, $sub;
- + push @$t, $sub;
- + }
- + #temporarily skip some failing tests
- + next if ($test_name =~ "col-0" or $test_name =~ "col-inval" or $test_name =~ "asan1");
- + push @new, ["$test_name-mb", @new_t, {ENV => "LC_ALL=$mb_locale"}];
- + }
- + push @Tests, @new;
- + }
- +
- @Tests = triple_test \@Tests;
- +# Remember that triple_test creates from each test with exactly one "IN"
- +# file two more tests (.p and .r suffix on name) corresponding to reading
- +# input from a file and from a pipe. The pipe-reading test would fail
- +# due to a race condition about 1 in 20 times.
- +# Remove the IN_PIPE version of the "output-is-input" test above.
- +# The others aren't susceptible because they have three inputs each.
- +@Tests = grep {$_->[0] ne 'output-is-input.p'} @Tests;
- +
- my $save_temps = $ENV{DEBUG};
- my $verbose = $ENV{VERBOSE};
- diff --git a/tests/unexpand/mb.sh b/tests/unexpand/mb.sh
- new file mode 100755
- index 0000000..8a82d74
- --- /dev/null
- +++ b/tests/unexpand/mb.sh
- @@ -0,0 +1,172 @@
- +#!/bin/sh
- +
- +# Copyright (C) 2012-2015 Free Software Foundation, Inc.
- +
- +# This program is free software: you can redistribute it and/or modify
- +# it under the terms of the GNU General Public License as published by
- +# the Free Software Foundation, either version 3 of the License, or
- +# (at your option) any later version.
- +
- +# This program is distributed in the hope that it will be useful,
- +# but WITHOUT ANY WARRANTY; without even the implied warranty of
- +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- +# GNU General Public License for more details.
- +
- +# You should have received a copy of the GNU General Public License
- +# along with this program. If not, see <http://www.gnu.org/licenses/>.
- +
- +. "${srcdir=.}/tests/init.sh"; path_prepend_ ./src
- +print_ver_ unexpand
- +
- +export LC_ALL=en_US.UTF-8
- +
- +#input containing multibyte characters
- +cat > in <<\EOF
- +1234567812345678123456781
- +. . . .
- +a b c d
- +. . . .
- +ä ö ü ß
- +. . . .
- + äöü . öüä. ä xx
- +EOF
- +
- +cat > exp <<\EOF
- +1234567812345678123456781
- +. . . .
- +a b c d
- +. . . .
- +ä ö ü ß
- +. . . .
- + äöü . öüä. ä xx
- +EOF
- +
- +unexpand -a < in > out || fail=1
- +compare exp out > /dev/null 2>&1 || fail=1
- +
- +
- +#multiple files as an input
- +cat >> exp <<\EOF
- +1234567812345678123456781
- +. . . .
- +a b c d
- +. . . .
- +ä ö ü ß
- +. . . .
- + äöü . öüä. ä xx
- +EOF
- +
- +
- +unexpand -a ./in ./in > out || fail=1
- +compare exp out > /dev/null 2>&1 || fail=1
- +
- +#test characters with a display width larger than 1
- +
- +env printf '12345678
- +e |ascii(1)
- +\u00E9 |composed(1)
- +e\u0301 |decomposed(1)
- +\u3000 |ideo-space(2)
- +\uFF0D |full-hypen(2)
- +' > in || framework_failure_
- +
- +env printf '12345678
- +e\t|ascii(1)
- +\u00E9\t|composed(1)
- +e\u0301\t|decomposed(1)
- +\u3000\t|ideo-space(2)
- +\uFF0D\t|full-hypen(2)
- +' > exp || framework_failure_
- +
- +unexpand -a < in > out || fail=1
- +compare exp out > /dev/null 2>&1 || fail=1
- +
- +#test input where a blank of width > 1 is not being substituted
- +in="$(LC_ALL=en_US.UTF-8 printf ' \u3000 ö ü ß')"
- +exp=' ö ü ß'
- +
- +unexpand -a < in > out || fail=1
- +compare exp out > /dev/null 2>&1 || fail=1
- +
- +#non-Unicode characters interspersed between Unicode ones
- +env printf '12345678
- + \xFF|
- +\xFF |
- + \xFFä|
- +ä\xFF |
- + ä\xFF|
- +\xFF ä|
- +äbcdef\xFF |
- +' > in || framework_failure_
- +
- +env printf '12345678
- +\t\xFF|
- +\xFF\t|
- +\t\xFFä|
- +ä\xFF\t|
- +\tä\xFF|
- +\xFF\tä|
- +äbcdef\xFF\t|
- +' > exp || framework_failure_
- +
- +unexpand -a < in > out || fail=1
- +compare exp out > /dev/null 2>&1 || fail=1
- +
- +#BOM header test 1
- +printf "\xEF\xBB\xBF" > in; cat <<\EOF >> in || framework_failure_
- +1234567812345678123456781
- +. . . .
- +a b c d
- +. . . .
- +ä ö ü ß
- +. . . .
- + äöü . öüä. ä xx
- +EOF
- +env printf ' äöü\t. öüä. \tä xx\n' >> in || framework_failure_
- +
- +printf "\xEF\xBB\xBF" > exp; cat <<\EOF >> exp || framework_failure_
- +1234567812345678123456781
- +. . . .
- +a b c d
- +. . . .
- +ä ö ü ß
- +. . . .
- + äöü . öüä. ä xx
- +EOF
- +
- +unexpand < in > out || fail=1
- +compare exp out > /dev/null 2>&1 || fail=1
- +
- +LANG=C unexpand < in > out || fail=1
- +compare exp out > /dev/null 2>&1 || fail=1
- +
- +LC_ALL=C unexpand < in > out || fail=1
- +compare exp out > /dev/null 2>&1 || fail=1
- +
- +
- +printf "\xEF\xBB\xBF" > exp; cat <<\EOF >> exp || framework_failure_
- +1234567812345678123456781
- +. . . .
- +a b c d
- +. . . .
- +ä ö ü ß
- +. . . .
- + äöü . öüä. ä xx
- +1234567812345678123456781
- +. . . .
- +a b c d
- +. . . .
- +ä ö ü ß
- +. . . .
- + äöü . öüä. ä xx
- +EOF
- +
- +
- +unexpand in in > out || fail=1
- +compare exp out > /dev/null 2>&1 || fail=1
- +
- +LANG=C unexpand in in > out || fail=1
- +compare exp out > /dev/null 2>&1 || fail=1
- +
- +LC_ALL=C unexpand in in > out || fail=1
- +compare exp out > /dev/null 2>&1 || fail=1
- --
- 2.34.1
|