5002_BFQ-2-block-introduce-the-v7r5-I-O-sched-for-3.16.patch1 205 KB

  1. From c56e6c5db41f7137d3e0b38063ef0c944eec1898 Mon Sep 17 00:00:00 2001
  2. From: Paolo Valente <paolo.valente@unimore.it>
  3. Date: Thu, 9 May 2013 19:10:02 +0200
  4. Subject: [PATCH 2/3] block: introduce the BFQ-v7r5 I/O sched for 3.16
  5. Add the BFQ-v7r5 I/O scheduler to 3.16.
  6. The general structure is borrowed from CFQ, as is much of the code for
  7. handling I/O contexts. Over time, several useful features have been
  8. ported from CFQ as well (details in the changelog in README.BFQ). A
  9. (bfq_)queue is associated with each task doing I/O on a device, and each
  10. time a scheduling decision has to be made, a queue is selected and served
  11. until it expires.
  12. - Slices are given in the service domain: tasks are assigned
  13. budgets, measured in number of sectors. Once granted the disk, a task
  14. must, however, consume its assigned budget within a configurable
  15. maximum time (by default, the maximum possible value of the
  16. budgets is automatically computed to comply with this timeout).
  17. This allows the desired latency vs "throughput boosting" tradeoff
  18. to be set.
  19. - Budgets are scheduled according to a variant of WF2Q+, implemented
  20. using an augmented rb-tree to take eligibility into account while
  21. preserving an O(log N) overall complexity.
  22. - A low-latency tunable is provided; if enabled, both interactive
  23. and soft real-time applications are guaranteed a very low latency.
  24. - Latency guarantees are also preserved in the presence of NCQ.
  25. - Even with flash-based devices, a high throughput is achieved
  26. while still preserving latency guarantees.
  27. - BFQ features Early Queue Merge (EQM), a sort of fusion of the
  28. cooperating-queue-merging and the preemption mechanisms present
  29. in CFQ. EQM is in fact a unified mechanism that tries to get a
  30. sequential read pattern, and hence a high throughput, with any
  31. set of processes performing interleaved I/O over a contiguous
  32. sequence of sectors.
  33. - BFQ supports full hierarchical scheduling, exporting a cgroups
  34. interface. Since each node has a full scheduler, each group can
  35. be assigned its own weight.
  36. - If the cgroups interface is not used, only I/O priorities can be
  37. assigned to processes, with ioprio values mapped to weights via the
  38. relation weight = IOPRIO_BE_NR - ioprio (see the sketch after this list).
  39. - ioprio classes are served in strict priority order, i.e., lower
  40. priority queues are not served as long as there are higher
  41. priority queues. Among queues in the same class the bandwidth is
  42. distributed in proportion to the weight of each queue. A small
  43. amount of extra bandwidth is nevertheless guaranteed to the Idle class, to
  44. prevent it from starving.
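As a quick illustration of the ioprio-to-weight relation mentioned in the list above, here is a minimal standalone C sketch. It is not part of the patch: IOPRIO_BE_NR is assumed to be 8, as in the 3.16 kernel headers, and ioprio_to_weight_example() is a made-up helper, not the in-tree bfq_ioprio_to_weight().

#include <stdio.h>

#define IOPRIO_BE_NR 8	/* number of best-effort ioprio levels (assumed, as in 3.16) */

/* The changelog's mapping: lower ioprio value (higher priority) -> larger weight. */
static int ioprio_to_weight_example(int ioprio)
{
	return IOPRIO_BE_NR - ioprio;	/* ioprio 0 -> weight 8, ..., ioprio 7 -> weight 1 */
}

int main(void)
{
	int ioprio;

	for (ioprio = 0; ioprio < IOPRIO_BE_NR; ioprio++)
		printf("ioprio %d -> weight %d\n",
		       ioprio, ioprio_to_weight_example(ioprio));
	return 0;
}

With this mapping, two best-effort queues at ioprios 0 and 4 would share bandwidth roughly in an 8:4 ratio, consistent with the proportional distribution described above.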
  45. Signed-off-by: Paolo Valente <paolo.valente@unimore.it>
  46. Signed-off-by: Arianna Avanzini <avanzini.arianna@gmail.com>
  47. ---
  48. block/bfq-cgroup.c | 930 +++++++++++++
  49. block/bfq-ioc.c | 36 +
  50. block/bfq-iosched.c | 3617 +++++++++++++++++++++++++++++++++++++++++++++++++++
  51. block/bfq-sched.c | 1207 +++++++++++++++++
  52. block/bfq.h | 742 +++++++++++
  53. 5 files changed, 6532 insertions(+)
  54. create mode 100644 block/bfq-cgroup.c
  55. create mode 100644 block/bfq-ioc.c
  56. create mode 100644 block/bfq-iosched.c
  57. create mode 100644 block/bfq-sched.c
  58. create mode 100644 block/bfq.h
  59. diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
  60. new file mode 100644
  61. index 0000000..f742806
  62. --- /dev/null
  63. +++ b/block/bfq-cgroup.c
  64. @@ -0,0 +1,930 @@
  65. +/*
  66. + * BFQ: CGROUPS support.
  67. + *
  68. + * Based on ideas and code from CFQ:
  69. + * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
  70. + *
  71. + * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
  72. + * Paolo Valente <paolo.valente@unimore.it>
  73. + *
  74. + * Copyright (C) 2010 Paolo Valente <paolo.valente@unimore.it>
  75. + *
  76. + * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ
  77. + * file.
  78. + */
  79. +
  80. +#ifdef CONFIG_CGROUP_BFQIO
  81. +
  82. +static DEFINE_MUTEX(bfqio_mutex);
  83. +
  84. +static bool bfqio_is_removed(struct bfqio_cgroup *bgrp)
  85. +{
  86. + return bgrp ? !bgrp->online : false;
  87. +}
  88. +
  89. +static struct bfqio_cgroup bfqio_root_cgroup = {
  90. + .weight = BFQ_DEFAULT_GRP_WEIGHT,
  91. + .ioprio = BFQ_DEFAULT_GRP_IOPRIO,
  92. + .ioprio_class = BFQ_DEFAULT_GRP_CLASS,
  93. +};
  94. +
  95. +static inline void bfq_init_entity(struct bfq_entity *entity,
  96. + struct bfq_group *bfqg)
  97. +{
  98. + entity->weight = entity->new_weight;
  99. + entity->orig_weight = entity->new_weight;
  100. + entity->ioprio = entity->new_ioprio;
  101. + entity->ioprio_class = entity->new_ioprio_class;
  102. + entity->parent = bfqg->my_entity;
  103. + entity->sched_data = &bfqg->sched_data;
  104. +}
  105. +
  106. +static struct bfqio_cgroup *css_to_bfqio(struct cgroup_subsys_state *css)
  107. +{
  108. + return css ? container_of(css, struct bfqio_cgroup, css) : NULL;
  109. +}
  110. +
  111. +/*
  112. + * Search for the bfq_group of bfqd in the hash table (for now only a list)
  113. + * of bgrp. Must be called under rcu_read_lock().
  114. + */
  115. +static struct bfq_group *bfqio_lookup_group(struct bfqio_cgroup *bgrp,
  116. + struct bfq_data *bfqd)
  117. +{
  118. + struct bfq_group *bfqg;
  119. + void *key;
  120. +
  121. + hlist_for_each_entry_rcu(bfqg, &bgrp->group_data, group_node) {
  122. + key = rcu_dereference(bfqg->bfqd);
  123. + if (key == bfqd)
  124. + return bfqg;
  125. + }
  126. +
  127. + return NULL;
  128. +}
  129. +
  130. +static inline void bfq_group_init_entity(struct bfqio_cgroup *bgrp,
  131. + struct bfq_group *bfqg)
  132. +{
  133. + struct bfq_entity *entity = &bfqg->entity;
  134. +
  135. + /*
  136. + * If the weight of the entity has never been set via the sysfs
  137. + * interface, then bgrp->weight == 0. In this case we initialize
  138. + * the weight from the current ioprio value. Otherwise, the group
  139. + * weight, if set, has priority over the ioprio value.
  140. + */
  141. + if (bgrp->weight == 0) {
  142. + entity->new_weight = bfq_ioprio_to_weight(bgrp->ioprio);
  143. + entity->new_ioprio = bgrp->ioprio;
  144. + } else {
  145. + entity->new_weight = bgrp->weight;
  146. + entity->new_ioprio = bfq_weight_to_ioprio(bgrp->weight);
  147. + }
  148. + entity->orig_weight = entity->weight = entity->new_weight;
  149. + entity->ioprio = entity->new_ioprio;
  150. + entity->ioprio_class = entity->new_ioprio_class = bgrp->ioprio_class;
  151. + entity->my_sched_data = &bfqg->sched_data;
  152. + bfqg->active_entities = 0;
  153. +}
  154. +
  155. +static inline void bfq_group_set_parent(struct bfq_group *bfqg,
  156. + struct bfq_group *parent)
  157. +{
  158. + struct bfq_entity *entity;
  159. +
  160. + BUG_ON(parent == NULL);
  161. + BUG_ON(bfqg == NULL);
  162. +
  163. + entity = &bfqg->entity;
  164. + entity->parent = parent->my_entity;
  165. + entity->sched_data = &parent->sched_data;
  166. +}
  167. +
  168. +/**
  169. + * bfq_group_chain_alloc - allocate a chain of groups.
  170. + * @bfqd: queue descriptor.
  171. + * @css: the leaf cgroup_subsys_state this chain starts from.
  172. + *
  173. + * Allocate a chain of groups starting from the one belonging to
  174. + * @cgroup up to the root cgroup. Stop if a cgroup on the chain
  175. + * to the root already has an allocated group on @bfqd.
  176. + */
  177. +static struct bfq_group *bfq_group_chain_alloc(struct bfq_data *bfqd,
  178. + struct cgroup_subsys_state *css)
  179. +{
  180. + struct bfqio_cgroup *bgrp;
  181. + struct bfq_group *bfqg, *prev = NULL, *leaf = NULL;
  182. +
  183. + for (; css != NULL; css = css->parent) {
  184. + bgrp = css_to_bfqio(css);
  185. +
  186. + bfqg = bfqio_lookup_group(bgrp, bfqd);
  187. + if (bfqg != NULL) {
  188. + /*
  189. + * All the cgroups in the path from there to the
  190. + * root must have a bfq_group for bfqd, so we don't
  191. + * need any more allocations.
  192. + */
  193. + break;
  194. + }
  195. +
  196. + bfqg = kzalloc(sizeof(*bfqg), GFP_ATOMIC);
  197. + if (bfqg == NULL)
  198. + goto cleanup;
  199. +
  200. + bfq_group_init_entity(bgrp, bfqg);
  201. + bfqg->my_entity = &bfqg->entity;
  202. +
  203. + if (leaf == NULL) {
  204. + leaf = bfqg;
  205. + prev = leaf;
  206. + } else {
  207. + bfq_group_set_parent(prev, bfqg);
  208. + /*
  209. + * Build a list of allocated nodes using the bfqd
  210. + * field, which is still unused and will be
  211. + * initialized only after the node is
  212. + * connected.
  213. + */
  214. + prev->bfqd = bfqg;
  215. + prev = bfqg;
  216. + }
  217. + }
  218. +
  219. + return leaf;
  220. +
  221. +cleanup:
  222. + while (leaf != NULL) {
  223. + prev = leaf;
  224. + leaf = leaf->bfqd;
  225. + kfree(prev);
  226. + }
  227. +
  228. + return NULL;
  229. +}
  230. +
  231. +/**
  232. + * bfq_group_chain_link - link an allocated group chain to a cgroup
  233. + * hierarchy.
  234. + * @bfqd: the queue descriptor.
  235. + * @css: the leaf cgroup_subsys_state to start from.
  236. + * @leaf: the leaf group (to be associated to @cgroup).
  237. + *
  238. + * Try to link a chain of groups to a cgroup hierarchy, connecting the
  239. + * nodes bottom-up, so we can be sure that when we find a cgroup in the
  240. + * hierarchy that already has a group associated to @bfqd, all the nodes
  241. + * in the path to the root cgroup have one too.
  242. + *
  243. + * On locking: the queue lock protects the hierarchy (there is a hierarchy
  244. + * per device) while the bfqio_cgroup lock protects the list of groups
  245. + * belonging to the same cgroup.
  246. + */
  247. +static void bfq_group_chain_link(struct bfq_data *bfqd,
  248. + struct cgroup_subsys_state *css,
  249. + struct bfq_group *leaf)
  250. +{
  251. + struct bfqio_cgroup *bgrp;
  252. + struct bfq_group *bfqg, *next, *prev = NULL;
  253. + unsigned long flags;
  254. +
  255. + assert_spin_locked(bfqd->queue->queue_lock);
  256. +
  257. + for (; css != NULL && leaf != NULL; css = css->parent) {
  258. + bgrp = css_to_bfqio(css);
  259. + next = leaf->bfqd;
  260. +
  261. + bfqg = bfqio_lookup_group(bgrp, bfqd);
  262. + BUG_ON(bfqg != NULL);
  263. +
  264. + spin_lock_irqsave(&bgrp->lock, flags);
  265. +
  266. + rcu_assign_pointer(leaf->bfqd, bfqd);
  267. + hlist_add_head_rcu(&leaf->group_node, &bgrp->group_data);
  268. + hlist_add_head(&leaf->bfqd_node, &bfqd->group_list);
  269. +
  270. + spin_unlock_irqrestore(&bgrp->lock, flags);
  271. +
  272. + prev = leaf;
  273. + leaf = next;
  274. + }
  275. +
  276. + BUG_ON(css == NULL && leaf != NULL);
  277. + if (css != NULL && prev != NULL) {
  278. + bgrp = css_to_bfqio(css);
  279. + bfqg = bfqio_lookup_group(bgrp, bfqd);
  280. + bfq_group_set_parent(prev, bfqg);
  281. + }
  282. +}
  283. +
  284. +/**
  285. + * bfq_find_alloc_group - return the group associated to @bfqd in @cgroup.
  286. + * @bfqd: queue descriptor.
  287. + * @cgroup: cgroup being searched for.
  288. + *
  289. + * Return a group associated to @bfqd in @cgroup, allocating one if
  290. + * necessary. When a group is returned all the cgroups in the path
  291. + * to the root have a group associated to @bfqd.
  292. + *
  293. + * If the allocation fails, return the root group: this breaks guarantees
  294. + * but is a safe fallback. If this loss becomes a problem it can be
  295. + * mitigated using the equivalent weight (given by the product of the
  296. + * weights of the groups in the path from @group to the root) in the
  297. + * root scheduler.
  298. + *
  299. + * We allocate all the missing nodes in the path from the leaf cgroup
  300. + * to the root and we connect the nodes only after all the allocations
  301. + * have been successful.
  302. + */
  303. +static struct bfq_group *bfq_find_alloc_group(struct bfq_data *bfqd,
  304. + struct cgroup_subsys_state *css)
  305. +{
  306. + struct bfqio_cgroup *bgrp = css_to_bfqio(css);
  307. + struct bfq_group *bfqg;
  308. +
  309. + bfqg = bfqio_lookup_group(bgrp, bfqd);
  310. + if (bfqg != NULL)
  311. + return bfqg;
  312. +
  313. + bfqg = bfq_group_chain_alloc(bfqd, css);
  314. + if (bfqg != NULL)
  315. + bfq_group_chain_link(bfqd, css, bfqg);
  316. + else
  317. + bfqg = bfqd->root_group;
  318. +
  319. + return bfqg;
  320. +}
  321. +
  322. +/**
  323. + * bfq_bfqq_move - migrate @bfqq to @bfqg.
  324. + * @bfqd: queue descriptor.
  325. + * @bfqq: the queue to move.
  326. + * @entity: @bfqq's entity.
  327. + * @bfqg: the group to move to.
  328. + *
  329. + * Move @bfqq to @bfqg, deactivating it from its old group and reactivating
  330. + * it on the new one. Avoid putting the entity on the old group idle tree.
  331. + *
  332. + * Must be called under the queue lock; the cgroup owning @bfqg must
  333. + * not disappear (by now this just means that we are called under
  334. + * rcu_read_lock()).
  335. + */
  336. +static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
  337. + struct bfq_entity *entity, struct bfq_group *bfqg)
  338. +{
  339. + int busy, resume;
  340. +
  341. + busy = bfq_bfqq_busy(bfqq);
  342. + resume = !RB_EMPTY_ROOT(&bfqq->sort_list);
  343. +
  344. + BUG_ON(resume && !entity->on_st);
  345. + BUG_ON(busy && !resume && entity->on_st &&
  346. + bfqq != bfqd->in_service_queue);
  347. +
  348. + if (busy) {
  349. + BUG_ON(atomic_read(&bfqq->ref) < 2);
  350. +
  351. + if (!resume)
  352. + bfq_del_bfqq_busy(bfqd, bfqq, 0);
  353. + else
  354. + bfq_deactivate_bfqq(bfqd, bfqq, 0);
  355. + } else if (entity->on_st)
  356. + bfq_put_idle_entity(bfq_entity_service_tree(entity), entity);
  357. +
  358. + /*
  359. + * Here we use a reference to bfqg. We don't need a refcounter
  360. + * as the cgroup reference will not be dropped, so that its
  361. + * destroy() callback will not be invoked.
  362. + */
  363. + entity->parent = bfqg->my_entity;
  364. + entity->sched_data = &bfqg->sched_data;
  365. +
  366. + if (busy && resume)
  367. + bfq_activate_bfqq(bfqd, bfqq);
  368. +
  369. + if (bfqd->in_service_queue == NULL && !bfqd->rq_in_driver)
  370. + bfq_schedule_dispatch(bfqd);
  371. +}
  372. +
  373. +/**
  374. + * __bfq_bic_change_cgroup - move @bic to @cgroup.
  375. + * @bfqd: the queue descriptor.
  376. + * @bic: the bic to move.
  377. + * @cgroup: the cgroup to move to.
  378. + *
  379. + * Move bic to cgroup, assuming that bfqd->queue is locked; the caller
  380. + * has to make sure that the reference to cgroup is valid across the call.
  381. + *
  382. + * NOTE: an alternative approach might have been to store the current
  383. + * cgroup in bfqq and to get a reference to it, reducing the lookup
  384. + * time here, at the price of slightly more complex code.
  385. + */
  386. +static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd,
  387. + struct bfq_io_cq *bic,
  388. + struct cgroup_subsys_state *css)
  389. +{
  390. + struct bfq_queue *async_bfqq = bic_to_bfqq(bic, 0);
  391. + struct bfq_queue *sync_bfqq = bic_to_bfqq(bic, 1);
  392. + struct bfq_entity *entity;
  393. + struct bfq_group *bfqg;
  394. + struct bfqio_cgroup *bgrp;
  395. +
  396. + bgrp = css_to_bfqio(css);
  397. +
  398. + bfqg = bfq_find_alloc_group(bfqd, css);
  399. + if (async_bfqq != NULL) {
  400. + entity = &async_bfqq->entity;
  401. +
  402. + if (entity->sched_data != &bfqg->sched_data) {
  403. + bic_set_bfqq(bic, NULL, 0);
  404. + bfq_log_bfqq(bfqd, async_bfqq,
  405. + "bic_change_group: %p %d",
  406. + async_bfqq, atomic_read(&async_bfqq->ref));
  407. + bfq_put_queue(async_bfqq);
  408. + }
  409. + }
  410. +
  411. + if (sync_bfqq != NULL) {
  412. + entity = &sync_bfqq->entity;
  413. + if (entity->sched_data != &bfqg->sched_data)
  414. + bfq_bfqq_move(bfqd, sync_bfqq, entity, bfqg);
  415. + }
  416. +
  417. + return bfqg;
  418. +}
  419. +
  420. +/**
  421. + * bfq_bic_change_cgroup - move @bic to @cgroup.
  422. + * @bic: the bic being migrated.
  423. + * @cgroup: the destination cgroup.
  424. + *
  425. + * When the task owning @bic is moved to @cgroup, @bic is immediately
  426. + * moved into its new parent group.
  427. + */
  428. +static void bfq_bic_change_cgroup(struct bfq_io_cq *bic,
  429. + struct cgroup_subsys_state *css)
  430. +{
  431. + struct bfq_data *bfqd;
  432. + unsigned long uninitialized_var(flags);
  433. +
  434. + bfqd = bfq_get_bfqd_locked(&(bic->icq.q->elevator->elevator_data),
  435. + &flags);
  436. + if (bfqd != NULL) {
  437. + __bfq_bic_change_cgroup(bfqd, bic, css);
  438. + bfq_put_bfqd_unlock(bfqd, &flags);
  439. + }
  440. +}
  441. +
  442. +/**
  443. + * bfq_bic_update_cgroup - update the cgroup of @bic.
  444. + * @bic: the @bic to update.
  445. + *
  446. + * Make sure that @bic is enqueued in the cgroup of the current task.
  447. + * We need this in addition to moving bics during the cgroup attach
  448. + * phase because the task owning @bic could be at its first disk
  449. + * access, or it may have ended up in the root cgroup as the result of
  450. + * a memory allocation failure, in which case we try here to move it to
  451. + * the right group.
  452. + *
  453. + * Must be called under the queue lock. It is safe to use the returned
  454. + * value even after the rcu_read_unlock() as the migration/destruction
  455. + * paths act under the queue lock too. IOW it is impossible to race with
  456. + * group migration/destruction and end up with an invalid group as:
  457. + * a) here the cgroup has not yet been destroyed, nor has its destroy
  458. + * callback started execution, as current holds a reference to it,
  459. + * b) if it is destroyed after rcu_read_unlock() [after current is
  460. + * migrated to a different cgroup] its attach() callback will have
  461. + * taken care of removing all the references to the old cgroup data.
  462. + */
  463. +static struct bfq_group *bfq_bic_update_cgroup(struct bfq_io_cq *bic)
  464. +{
  465. + struct bfq_data *bfqd = bic_to_bfqd(bic);
  466. + struct bfq_group *bfqg;
  467. + struct cgroup_subsys_state *css;
  468. +
  469. + BUG_ON(bfqd == NULL);
  470. +
  471. + rcu_read_lock();
  472. + css = task_css(current, bfqio_cgrp_id);
  473. + bfqg = __bfq_bic_change_cgroup(bfqd, bic, css);
  474. + rcu_read_unlock();
  475. +
  476. + return bfqg;
  477. +}
  478. +
  479. +/**
  480. + * bfq_flush_idle_tree - deactivate any entity on the idle tree of @st.
  481. + * @st: the service tree being flushed.
  482. + */
  483. +static inline void bfq_flush_idle_tree(struct bfq_service_tree *st)
  484. +{
  485. + struct bfq_entity *entity = st->first_idle;
  486. +
  487. + for (; entity != NULL; entity = st->first_idle)
  488. + __bfq_deactivate_entity(entity, 0);
  489. +}
  490. +
  491. +/**
  492. + * bfq_reparent_leaf_entity - move leaf entity to the root_group.
  493. + * @bfqd: the device data structure with the root group.
  494. + * @entity: the entity to move.
  495. + */
  496. +static inline void bfq_reparent_leaf_entity(struct bfq_data *bfqd,
  497. + struct bfq_entity *entity)
  498. +{
  499. + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
  500. +
  501. + BUG_ON(bfqq == NULL);
  502. + bfq_bfqq_move(bfqd, bfqq, entity, bfqd->root_group);
  503. + return;
  504. +}
  505. +
  506. +/**
  507. + * bfq_reparent_active_entities - move to the root group all active
  508. + * entities.
  509. + * @bfqd: the device data structure with the root group.
  510. + * @bfqg: the group to move from.
  511. + * @st: the service tree with the entities.
  512. + *
  513. + * Needs queue_lock to be taken and reference to be valid over the call.
  514. + */
  515. +static inline void bfq_reparent_active_entities(struct bfq_data *bfqd,
  516. + struct bfq_group *bfqg,
  517. + struct bfq_service_tree *st)
  518. +{
  519. + struct rb_root *active = &st->active;
  520. + struct bfq_entity *entity = NULL;
  521. +
  522. + if (!RB_EMPTY_ROOT(&st->active))
  523. + entity = bfq_entity_of(rb_first(active));
  524. +
  525. + for (; entity != NULL; entity = bfq_entity_of(rb_first(active)))
  526. + bfq_reparent_leaf_entity(bfqd, entity);
  527. +
  528. + if (bfqg->sched_data.in_service_entity != NULL)
  529. + bfq_reparent_leaf_entity(bfqd,
  530. + bfqg->sched_data.in_service_entity);
  531. +
  532. + return;
  533. +}
  534. +
  535. +/**
  536. + * bfq_destroy_group - destroy @bfqg.
  537. + * @bgrp: the bfqio_cgroup containing @bfqg.
  538. + * @bfqg: the group being destroyed.
  539. + *
  540. + * Destroy @bfqg, making sure that it is not referenced from its parent.
  541. + */
  542. +static void bfq_destroy_group(struct bfqio_cgroup *bgrp, struct bfq_group *bfqg)
  543. +{
  544. + struct bfq_data *bfqd;
  545. + struct bfq_service_tree *st;
  546. + struct bfq_entity *entity = bfqg->my_entity;
  547. + unsigned long uninitialized_var(flags);
  548. + int i;
  549. +
  550. + hlist_del(&bfqg->group_node);
  551. +
  552. + /*
  553. + * Empty all service_trees belonging to this group before
  554. + * deactivating the group itself.
  555. + */
  556. + for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) {
  557. + st = bfqg->sched_data.service_tree + i;
  558. +
  559. + /*
  560. + * The idle tree may still contain bfq_queues belonging
  561. + * to exited tasks because they never migrated to a different
  562. + * cgroup from the one being destroyed now. No one else
  563. + * can access them so it's safe to act without any lock.
  564. + */
  565. + bfq_flush_idle_tree(st);
  566. +
  567. + /*
  568. + * It may happen that some queues are still active
  569. + * (busy) upon group destruction (if the corresponding
  570. + * processes have been forced to terminate). We move
  571. + * all the leaf entities corresponding to these queues
  572. + * to the root_group.
  573. + * Also, it may happen that the group has an entity
  574. + * in service, which is disconnected from the active
  575. + * tree: it must be moved, too.
  576. + * There is no need to put the sync queues, as the
  577. + * scheduler has taken no reference.
  578. + */
  579. + bfqd = bfq_get_bfqd_locked(&bfqg->bfqd, &flags);
  580. + if (bfqd != NULL) {
  581. + bfq_reparent_active_entities(bfqd, bfqg, st);
  582. + bfq_put_bfqd_unlock(bfqd, &flags);
  583. + }
  584. + BUG_ON(!RB_EMPTY_ROOT(&st->active));
  585. + BUG_ON(!RB_EMPTY_ROOT(&st->idle));
  586. + }
  587. + BUG_ON(bfqg->sched_data.next_in_service != NULL);
  588. + BUG_ON(bfqg->sched_data.in_service_entity != NULL);
  589. +
  590. + /*
  591. + * We may race with device destruction, so take extra care when
  592. + * dereferencing bfqg->bfqd.
  593. + */
  594. + bfqd = bfq_get_bfqd_locked(&bfqg->bfqd, &flags);
  595. + if (bfqd != NULL) {
  596. + hlist_del(&bfqg->bfqd_node);
  597. + __bfq_deactivate_entity(entity, 0);
  598. + bfq_put_async_queues(bfqd, bfqg);
  599. + bfq_put_bfqd_unlock(bfqd, &flags);
  600. + }
  601. + BUG_ON(entity->tree != NULL);
  602. +
  603. + /*
  604. + * No need to defer the kfree() to the end of the RCU grace
  605. + * period: we are called from the destroy() callback of our
  606. + * cgroup, so we can be sure that no one is a) still using
  607. + * this cgroup or b) doing lookups in it.
  608. + */
  609. + kfree(bfqg);
  610. +}
  611. +
  612. +static void bfq_end_wr_async(struct bfq_data *bfqd)
  613. +{
  614. + struct hlist_node *tmp;
  615. + struct bfq_group *bfqg;
  616. +
  617. + hlist_for_each_entry_safe(bfqg, tmp, &bfqd->group_list, bfqd_node)
  618. + bfq_end_wr_async_queues(bfqd, bfqg);
  619. + bfq_end_wr_async_queues(bfqd, bfqd->root_group);
  620. +}
  621. +
  622. +/**
  623. + * bfq_disconnect_groups - disconnect @bfqd from all its groups.
  624. + * @bfqd: the device descriptor being exited.
  625. + *
  626. + * When the device exits we just make sure that no lookup can return
  627. + * the now unused group structures. They will be deallocated on cgroup
  628. + * destruction.
  629. + */
  630. +static void bfq_disconnect_groups(struct bfq_data *bfqd)
  631. +{
  632. + struct hlist_node *tmp;
  633. + struct bfq_group *bfqg;
  634. +
  635. + bfq_log(bfqd, "disconnect_groups beginning");
  636. + hlist_for_each_entry_safe(bfqg, tmp, &bfqd->group_list, bfqd_node) {
  637. + hlist_del(&bfqg->bfqd_node);
  638. +
  639. + __bfq_deactivate_entity(bfqg->my_entity, 0);
  640. +
  641. + /*
  642. + * Don't remove from the group hash, just set an
  643. + * invalid key. No lookups can race with the
  644. + * assignment as bfqd is being destroyed; this
  645. + * implies also that new elements cannot be added
  646. + * to the list.
  647. + */
  648. + rcu_assign_pointer(bfqg->bfqd, NULL);
  649. +
  650. + bfq_log(bfqd, "disconnect_groups: put async for group %p",
  651. + bfqg);
  652. + bfq_put_async_queues(bfqd, bfqg);
  653. + }
  654. +}
  655. +
  656. +static inline void bfq_free_root_group(struct bfq_data *bfqd)
  657. +{
  658. + struct bfqio_cgroup *bgrp = &bfqio_root_cgroup;
  659. + struct bfq_group *bfqg = bfqd->root_group;
  660. +
  661. + bfq_put_async_queues(bfqd, bfqg);
  662. +
  663. + spin_lock_irq(&bgrp->lock);
  664. + hlist_del_rcu(&bfqg->group_node);
  665. + spin_unlock_irq(&bgrp->lock);
  666. +
  667. + /*
  668. + * No need to synchronize_rcu() here: since the device is gone
  669. + * there cannot be any read-side access to its root_group.
  670. + */
  671. + kfree(bfqg);
  672. +}
  673. +
  674. +static struct bfq_group *bfq_alloc_root_group(struct bfq_data *bfqd, int node)
  675. +{
  676. + struct bfq_group *bfqg;
  677. + struct bfqio_cgroup *bgrp;
  678. + int i;
  679. +
  680. + bfqg = kzalloc_node(sizeof(*bfqg), GFP_KERNEL, node);
  681. + if (bfqg == NULL)
  682. + return NULL;
  683. +
  684. + bfqg->entity.parent = NULL;
  685. + for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)
  686. + bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT;
  687. +
  688. + bgrp = &bfqio_root_cgroup;
  689. + spin_lock_irq(&bgrp->lock);
  690. + rcu_assign_pointer(bfqg->bfqd, bfqd);
  691. + hlist_add_head_rcu(&bfqg->group_node, &bgrp->group_data);
  692. + spin_unlock_irq(&bgrp->lock);
  693. +
  694. + return bfqg;
  695. +}
  696. +
  697. +#define SHOW_FUNCTION(__VAR) \
  698. +static u64 bfqio_cgroup_##__VAR##_read(struct cgroup_subsys_state *css, \
  699. + struct cftype *cftype) \
  700. +{ \
  701. + struct bfqio_cgroup *bgrp = css_to_bfqio(css); \
  702. + u64 ret = -ENODEV; \
  703. + \
  704. + mutex_lock(&bfqio_mutex); \
  705. + if (bfqio_is_removed(bgrp)) \
  706. + goto out_unlock; \
  707. + \
  708. + spin_lock_irq(&bgrp->lock); \
  709. + ret = bgrp->__VAR; \
  710. + spin_unlock_irq(&bgrp->lock); \
  711. + \
  712. +out_unlock: \
  713. + mutex_unlock(&bfqio_mutex); \
  714. + return ret; \
  715. +}
  716. +
  717. +SHOW_FUNCTION(weight);
  718. +SHOW_FUNCTION(ioprio);
  719. +SHOW_FUNCTION(ioprio_class);
  720. +#undef SHOW_FUNCTION
  721. +
  722. +#define STORE_FUNCTION(__VAR, __MIN, __MAX) \
  723. +static int bfqio_cgroup_##__VAR##_write(struct cgroup_subsys_state *css,\
  724. + struct cftype *cftype, \
  725. + u64 val) \
  726. +{ \
  727. + struct bfqio_cgroup *bgrp = css_to_bfqio(css); \
  728. + struct bfq_group *bfqg; \
  729. + int ret = -EINVAL; \
  730. + \
  731. + if (val < (__MIN) || val > (__MAX)) \
  732. + return ret; \
  733. + \
  734. + ret = -ENODEV; \
  735. + mutex_lock(&bfqio_mutex); \
  736. + if (bfqio_is_removed(bgrp)) \
  737. + goto out_unlock; \
  738. + ret = 0; \
  739. + \
  740. + spin_lock_irq(&bgrp->lock); \
  741. + bgrp->__VAR = (unsigned short)val; \
  742. + hlist_for_each_entry(bfqg, &bgrp->group_data, group_node) { \
  743. + /* \
  744. + * Setting the ioprio_changed flag of the entity \
  745. + * to 1 with new_##__VAR == ##__VAR would re-set \
  746. + * the value of the weight to its ioprio mapping. \
  747. + * Set the flag only if necessary. \
  748. + */ \
  749. + if ((unsigned short)val != bfqg->entity.new_##__VAR) { \
  750. + bfqg->entity.new_##__VAR = (unsigned short)val; \
  751. + /* \
  752. + * Make sure that the above new value has been \
  753. + * stored in bfqg->entity.new_##__VAR before \
  754. + * setting the ioprio_changed flag. In fact, \
  755. + * this flag may be read asynchronously (in \
  756. + * critical sections protected by a different \
  757. + * lock than that held here), and finding this \
  758. + * flag set may cause the execution of the code \
  759. + * for updating parameters whose value may \
  760. + * depend also on bfqg->entity.new_##__VAR (in \
  761. + * __bfq_entity_update_weight_prio). \
  762. + * This barrier makes sure that the new value \
  763. + * of bfqg->entity.new_##__VAR is correctly \
  764. + * seen in that code. \
  765. + */ \
  766. + smp_wmb(); \
  767. + bfqg->entity.ioprio_changed = 1; \
  768. + } \
  769. + } \
  770. + spin_unlock_irq(&bgrp->lock); \
  771. + \
  772. +out_unlock: \
  773. + mutex_unlock(&bfqio_mutex); \
  774. + return ret; \
  775. +}
  776. +
  777. +STORE_FUNCTION(weight, BFQ_MIN_WEIGHT, BFQ_MAX_WEIGHT);
  778. +STORE_FUNCTION(ioprio, 0, IOPRIO_BE_NR - 1);
  779. +STORE_FUNCTION(ioprio_class, IOPRIO_CLASS_RT, IOPRIO_CLASS_IDLE);
  780. +#undef STORE_FUNCTION
  781. +
  782. +static struct cftype bfqio_files[] = {
  783. + {
  784. + .name = "weight",
  785. + .read_u64 = bfqio_cgroup_weight_read,
  786. + .write_u64 = bfqio_cgroup_weight_write,
  787. + },
  788. + {
  789. + .name = "ioprio",
  790. + .read_u64 = bfqio_cgroup_ioprio_read,
  791. + .write_u64 = bfqio_cgroup_ioprio_write,
  792. + },
  793. + {
  794. + .name = "ioprio_class",
  795. + .read_u64 = bfqio_cgroup_ioprio_class_read,
  796. + .write_u64 = bfqio_cgroup_ioprio_class_write,
  797. + },
  798. + { }, /* terminate */
  799. +};
  800. +
  801. +static struct cgroup_subsys_state *bfqio_create(struct cgroup_subsys_state
  802. + *parent_css)
  803. +{
  804. + struct bfqio_cgroup *bgrp;
  805. +
  806. + if (parent_css != NULL) {
  807. + bgrp = kzalloc(sizeof(*bgrp), GFP_KERNEL);
  808. + if (bgrp == NULL)
  809. + return ERR_PTR(-ENOMEM);
  810. + } else
  811. + bgrp = &bfqio_root_cgroup;
  812. +
  813. + spin_lock_init(&bgrp->lock);
  814. + INIT_HLIST_HEAD(&bgrp->group_data);
  815. + bgrp->ioprio = BFQ_DEFAULT_GRP_IOPRIO;
  816. + bgrp->ioprio_class = BFQ_DEFAULT_GRP_CLASS;
  817. +
  818. + return &bgrp->css;
  819. +}
  820. +
  821. +/*
  822. + * We cannot support shared io contexts, as we have no means to support
  823. + * two tasks with the same ioc in two different groups without major rework
  824. + * of the main bic/bfqq data structures. For now we allow a task to change
  825. + * its cgroup only if it's the only owner of its ioc; the drawback of this
  826. + * behavior is that a group containing a task that forked using CLONE_IO
  827. + * will not be destroyed until the tasks sharing the ioc die.
  828. + */
  829. +static int bfqio_can_attach(struct cgroup_subsys_state *css,
  830. + struct cgroup_taskset *tset)
  831. +{
  832. + struct task_struct *task;
  833. + struct io_context *ioc;
  834. + int ret = 0;
  835. +
  836. + cgroup_taskset_for_each(task, tset) {
  837. + /*
  838. + * task_lock() is needed to avoid races with
  839. + * exit_io_context()
  840. + */
  841. + task_lock(task);
  842. + ioc = task->io_context;
  843. + if (ioc != NULL && atomic_read(&ioc->nr_tasks) > 1)
  844. + /*
  845. + * ioc == NULL means that the task is either too
  846. + * young or exiting: if it has still no ioc the
  847. + * ioc can't be shared, if the task is exiting the
  848. + * attach will fail anyway, no matter what we
  849. + * return here.
  850. + */
  851. + ret = -EINVAL;
  852. + task_unlock(task);
  853. + if (ret)
  854. + break;
  855. + }
  856. +
  857. + return ret;
  858. +}
  859. +
  860. +static void bfqio_attach(struct cgroup_subsys_state *css,
  861. + struct cgroup_taskset *tset)
  862. +{
  863. + struct task_struct *task;
  864. + struct io_context *ioc;
  865. + struct io_cq *icq;
  866. +
  867. + /*
  868. + * IMPORTANT NOTE: The move of more than one process at a time to a
  869. + * new group has not yet been tested.
  870. + */
  871. + cgroup_taskset_for_each(task, tset) {
  872. + ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE);
  873. + if (ioc) {
  874. + /*
  875. + * Handle cgroup change here.
  876. + */
  877. + rcu_read_lock();
  878. + hlist_for_each_entry_rcu(icq, &ioc->icq_list, ioc_node)
  879. + if (!strncmp(
  880. + icq->q->elevator->type->elevator_name,
  881. + "bfq", ELV_NAME_MAX))
  882. + bfq_bic_change_cgroup(icq_to_bic(icq),
  883. + css);
  884. + rcu_read_unlock();
  885. + put_io_context(ioc);
  886. + }
  887. + }
  888. +}
  889. +
  890. +static void bfqio_destroy(struct cgroup_subsys_state *css)
  891. +{
  892. + struct bfqio_cgroup *bgrp = css_to_bfqio(css);
  893. + struct hlist_node *tmp;
  894. + struct bfq_group *bfqg;
  895. +
  896. + /*
  897. + * Since we are destroying the cgroup, there are no more tasks
  898. + * referencing it, and all the RCU read-side critical sections that
  899. + * may have referenced it have ended (as the destruction of the parent
  900. + * cgroup is RCU-safe); bgrp->group_data will not be accessed by
  901. + * anything else and we don't need any synchronization.
  902. + */
  903. + hlist_for_each_entry_safe(bfqg, tmp, &bgrp->group_data, group_node)
  904. + bfq_destroy_group(bgrp, bfqg);
  905. +
  906. + BUG_ON(!hlist_empty(&bgrp->group_data));
  907. +
  908. + kfree(bgrp);
  909. +}
  910. +
  911. +static int bfqio_css_online(struct cgroup_subsys_state *css)
  912. +{
  913. + struct bfqio_cgroup *bgrp = css_to_bfqio(css);
  914. +
  915. + mutex_lock(&bfqio_mutex);
  916. + bgrp->online = true;
  917. + mutex_unlock(&bfqio_mutex);
  918. +
  919. + return 0;
  920. +}
  921. +
  922. +static void bfqio_css_offline(struct cgroup_subsys_state *css)
  923. +{
  924. + struct bfqio_cgroup *bgrp = css_to_bfqio(css);
  925. +
  926. + mutex_lock(&bfqio_mutex);
  927. + bgrp->online = false;
  928. + mutex_unlock(&bfqio_mutex);
  929. +}
  930. +
  931. +struct cgroup_subsys bfqio_cgrp_subsys = {
  932. + .css_alloc = bfqio_create,
  933. + .css_online = bfqio_css_online,
  934. + .css_offline = bfqio_css_offline,
  935. + .can_attach = bfqio_can_attach,
  936. + .attach = bfqio_attach,
  937. + .css_free = bfqio_destroy,
  938. + .base_cftypes = bfqio_files,
  939. +};
  940. +#else
  941. +static inline void bfq_init_entity(struct bfq_entity *entity,
  942. + struct bfq_group *bfqg)
  943. +{
  944. + entity->weight = entity->new_weight;
  945. + entity->orig_weight = entity->new_weight;
  946. + entity->ioprio = entity->new_ioprio;
  947. + entity->ioprio_class = entity->new_ioprio_class;
  948. + entity->sched_data = &bfqg->sched_data;
  949. +}
  950. +
  951. +static inline struct bfq_group *
  952. +bfq_bic_update_cgroup(struct bfq_io_cq *bic)
  953. +{
  954. + struct bfq_data *bfqd = bic_to_bfqd(bic);
  955. + return bfqd->root_group;
  956. +}
  957. +
  958. +static inline void bfq_bfqq_move(struct bfq_data *bfqd,
  959. + struct bfq_queue *bfqq,
  960. + struct bfq_entity *entity,
  961. + struct bfq_group *bfqg)
  962. +{
  963. +}
  964. +
  965. +static void bfq_end_wr_async(struct bfq_data *bfqd)
  966. +{
  967. + bfq_end_wr_async_queues(bfqd, bfqd->root_group);
  968. +}
  969. +
  970. +static inline void bfq_disconnect_groups(struct bfq_data *bfqd)
  971. +{
  972. + bfq_put_async_queues(bfqd, bfqd->root_group);
  973. +}
  974. +
  975. +static inline void bfq_free_root_group(struct bfq_data *bfqd)
  976. +{
  977. + kfree(bfqd->root_group);
  978. +}
  979. +
  980. +static struct bfq_group *bfq_alloc_root_group(struct bfq_data *bfqd, int node)
  981. +{
  982. + struct bfq_group *bfqg;
  983. + int i;
  984. +
  985. + bfqg = kmalloc_node(sizeof(*bfqg), GFP_KERNEL | __GFP_ZERO, node);
  986. + if (bfqg == NULL)
  987. + return NULL;
  988. +
  989. + for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)
  990. + bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT;
  991. +
  992. + return bfqg;
  993. +}
  994. +#endif
  995. diff --git a/block/bfq-ioc.c b/block/bfq-ioc.c
  996. new file mode 100644
  997. index 0000000..7f6b000
  998. --- /dev/null
  999. +++ b/block/bfq-ioc.c
  1000. @@ -0,0 +1,36 @@
  1001. +/*
  1002. + * BFQ: I/O context handling.
  1003. + *
  1004. + * Based on ideas and code from CFQ:
  1005. + * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
  1006. + *
  1007. + * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
  1008. + * Paolo Valente <paolo.valente@unimore.it>
  1009. + *
  1010. + * Copyright (C) 2010 Paolo Valente <paolo.valente@unimore.it>
  1011. + */
  1012. +
  1013. +/**
  1014. + * icq_to_bic - convert iocontext queue structure to bfq_io_cq.
  1015. + * @icq: the iocontext queue.
  1016. + */
  1017. +static inline struct bfq_io_cq *icq_to_bic(struct io_cq *icq)
  1018. +{
  1019. + /* bic->icq is the first member, %NULL will convert to %NULL */
  1020. + return container_of(icq, struct bfq_io_cq, icq);
  1021. +}
  1022. +
  1023. +/**
  1024. + * bfq_bic_lookup - search into @ioc a bic associated to @bfqd.
  1025. + * @bfqd: the lookup key.
  1026. + * @ioc: the io_context of the process doing I/O.
  1027. + *
  1028. + * Queue lock must be held.
  1029. + */
  1030. +static inline struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd,
  1031. + struct io_context *ioc)
  1032. +{
  1033. + if (ioc)
  1034. + return icq_to_bic(ioc_lookup_icq(ioc, bfqd->queue));
  1035. + return NULL;
  1036. +}
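
A side note on the icq_to_bic() helper above: it relies on struct bfq_io_cq embedding its struct io_cq as the very first member, so the container_of() conversion is a plain cast and a NULL icq pointer maps to a NULL bic pointer. Below is a minimal standalone sketch of that layout trick; the structure names and the simplified container_of macro are made up for illustration and are not part of the patch.

#include <stddef.h>
#include <stdio.h>

/* Hypothetical stand-ins for io_cq/bfq_io_cq, just to show the layout. */
struct inner { int x; };
struct outer { struct inner icq; int extra; };	/* 'icq' is the FIRST member */

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

int main(void)
{
	struct outer o = { { 1 }, 2 };
	struct inner *ip = &o.icq;

	/* Because offsetof(struct outer, icq) == 0, the conversion subtracts
	 * nothing, which is the property the comment in icq_to_bic() relies
	 * on when it says a NULL icq converts to a NULL bic. */
	struct outer *op = container_of(ip, struct outer, icq);

	printf("extra = %d, offset of first member = %zu\n",
	       op->extra, offsetof(struct outer, icq));
	return 0;
}
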
  1037. diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
  1038. new file mode 100644
  1039. index 0000000..0a0891b
  1040. --- /dev/null
  1041. +++ b/block/bfq-iosched.c
  1042. @@ -0,0 +1,3617 @@
  1043. +/*
  1044. + * Budget Fair Queueing (BFQ) disk scheduler.
  1045. + *
  1046. + * Based on ideas and code from CFQ:
  1047. + * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
  1048. + *
  1049. + * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
  1050. + * Paolo Valente <paolo.valente@unimore.it>
  1051. + *
  1052. + * Copyright (C) 2010 Paolo Valente <paolo.valente@unimore.it>
  1053. + *
  1054. + * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ
  1055. + * file.
  1056. + *
  1057. + * BFQ is a proportional-share storage-I/O scheduling algorithm based on
  1058. + * the slice-by-slice service scheme of CFQ. But BFQ assigns budgets,
  1059. + * measured in number of sectors, to processes instead of time slices. The
  1060. + * device is not granted to the in-service process for a given time slice,
  1061. + * but until it has exhausted its assigned budget. This change from the time
  1062. + * to the service domain allows BFQ to distribute the device throughput
  1063. + * among processes as desired, without any distortion due to ZBR, workload
  1064. + * fluctuations or other factors. BFQ uses an ad hoc internal scheduler,
  1065. + * called B-WF2Q+, to schedule processes according to their budgets. More
  1066. + * precisely, BFQ schedules queues associated to processes. Thanks to the
  1067. + * accurate policy of B-WF2Q+, BFQ can afford to assign high budgets to
  1068. + * I/O-bound processes issuing sequential requests (to boost the
  1069. + * throughput), and yet guarantee a low latency to interactive and soft
  1070. + * real-time applications.
  1071. + *
  1072. + * BFQ is described in [1], which also contains a reference to the initial,
  1073. + * more theoretical paper on BFQ. In the latter paper the interested reader
  1074. + * can find full details on the main algorithm, as well as formulas of the
  1075. + * guarantees and formal proofs of all the properties.
  1076. + * With respect to the version of BFQ presented in these papers, this
  1077. + * implementation adds a few more heuristics, such as the one that
  1078. + * guarantees a low latency to soft real-time applications, and a
  1079. + * hierarchical extension based on H-WF2Q+.
  1080. + *
  1081. + * B-WF2Q+ is based on WF2Q+, that is described in [2], together with
  1082. + * H-WF2Q+, while the augmented tree used to implement B-WF2Q+ with O(log N)
  1083. + * complexity derives from the one introduced with EEVDF in [3].
  1084. + *
  1085. + * [1] P. Valente and M. Andreolini, ``Improving Application Responsiveness
  1086. + * with the BFQ Disk I/O Scheduler'',
  1087. + * Proceedings of the 5th Annual International Systems and Storage
  1088. + * Conference (SYSTOR '12), June 2012.
  1089. + *
  1090. + * http://algogroup.unimo.it/people/paolo/disk_sched/bf1-v1-suite-results.pdf
  1091. + *
  1092. + * [2] Jon C.R. Bennett and H. Zhang, ``Hierarchical Packet Fair Queueing
  1093. + * Algorithms,'' IEEE/ACM Transactions on Networking, 5(5):675-689,
  1094. + * Oct 1997.
  1095. + *
  1096. + * http://www.cs.cmu.edu/~hzhang/papers/TON-97-Oct.ps.gz
  1097. + *
  1098. + * [3] I. Stoica and H. Abdel-Wahab, ``Earliest Eligible Virtual Deadline
  1099. + * First: A Flexible and Accurate Mechanism for Proportional Share
  1100. + * Resource Allocation,'' technical report.
  1101. + *
  1102. + * http://www.cs.berkeley.edu/~istoica/papers/eevdf-tr-95.pdf
  1103. + */
  1104. +#include <linux/module.h>
  1105. +#include <linux/slab.h>
  1106. +#include <linux/blkdev.h>
  1107. +#include <linux/cgroup.h>
  1108. +#include <linux/elevator.h>
  1109. +#include <linux/jiffies.h>
  1110. +#include <linux/rbtree.h>
  1111. +#include <linux/ioprio.h>
  1112. +#include "bfq.h"
  1113. +#include "blk.h"
  1114. +
  1115. +/* Max number of dispatches in one round of service. */
  1116. +static const int bfq_quantum = 4;
  1117. +
  1118. +/* Expiration time of sync (0) and async (1) requests, in jiffies. */
  1119. +static const int bfq_fifo_expire[2] = { HZ / 4, HZ / 8 };
  1120. +
  1121. +/* Maximum backwards seek, in KiB. */
  1122. +static const int bfq_back_max = 16 * 1024;
  1123. +
  1124. +/* Penalty of a backwards seek, in number of sectors. */
  1125. +static const int bfq_back_penalty = 2;
  1126. +
  1127. +/* Idling period duration, in jiffies. */
  1128. +static int bfq_slice_idle = HZ / 125;
  1129. +
  1130. +/* Default maximum budget values, in sectors and number of requests. */
  1131. +static const int bfq_default_max_budget = 16 * 1024;
  1132. +static const int bfq_max_budget_async_rq = 4;
  1133. +
  1134. +/*
  1135. + * Async to sync throughput distribution is controlled as follows:
  1136. + * when an async request is served, the entity is charged the number
  1137. + * of sectors of the request, multiplied by the factor below
  1138. + */
  1139. +static const int bfq_async_charge_factor = 10;
  1140. +
  1141. +/* Default timeout values, in jiffies, approximating CFQ defaults. */
  1142. +static const int bfq_timeout_sync = HZ / 8;
  1143. +static int bfq_timeout_async = HZ / 25;
  1144. +
  1145. +struct kmem_cache *bfq_pool;
  1146. +
  1147. +/* Below this threshold (in ms), we consider thinktime immediate. */
  1148. +#define BFQ_MIN_TT 2
  1149. +
  1150. +/* hw_tag detection: parallel requests threshold and min samples needed. */
  1151. +#define BFQ_HW_QUEUE_THRESHOLD 4
  1152. +#define BFQ_HW_QUEUE_SAMPLES 32
  1153. +
  1154. +#define BFQQ_SEEK_THR (sector_t)(8 * 1024)
  1155. +#define BFQQ_SEEKY(bfqq) ((bfqq)->seek_mean > BFQQ_SEEK_THR)
  1156. +
  1157. +/* Min samples used for peak rate estimation (for autotuning). */
  1158. +#define BFQ_PEAK_RATE_SAMPLES 32
  1159. +
  1160. +/* Shift used for peak rate fixed precision calculations. */
  1161. +#define BFQ_RATE_SHIFT 16
  1162. +
  1163. +/*
  1164. + * By default, BFQ computes the duration of the weight raising for
  1165. + * interactive applications automatically, using the following formula:
  1166. + * duration = (R / r) * T, where r is the peak rate of the device, and
  1167. + * R and T are two reference parameters.
  1168. + * In particular, R is the peak rate of the reference device (see below),
  1169. + * and T is a reference time: given the systems that are likely to be
  1170. + * installed on the reference device according to its speed class, T is
  1171. + * about the maximum time needed, under BFQ and while reading two files in
  1172. + * parallel, to load typical large applications on these systems.
  1173. + * In practice, the slower/faster the device at hand is, the more/less time
  1174. + * it takes to load applications with respect to the reference device.
  1175. + * Accordingly, BFQ grants a longer/shorter weight-raising period to
  1176. + * interactive applications.
  1177. + *
  1178. + * BFQ uses four different reference pairs (R, T), depending on:
  1179. + * . whether the device is rotational or non-rotational;
  1180. + * . whether the device is slow, such as old or portable HDDs, as well as
  1181. + * SD cards, or fast, such as newer HDDs and SSDs.
  1182. + *
  1183. + * The device's speed class is dynamically (re)detected in
  1184. + * bfq_update_peak_rate() every time the estimated peak rate is updated.
  1185. + *
  1186. + * In the following definitions, R_slow[0]/R_fast[0] and T_slow[0]/T_fast[0]
  1187. + * are the reference values for a slow/fast rotational device, whereas
  1188. + * R_slow[1]/R_fast[1] and T_slow[1]/T_fast[1] are the reference values for
  1189. + * a slow/fast non-rotational device. Finally, device_speed_thresh are the
  1190. + * thresholds used to switch between speed classes.
  1191. + * Both the reference peak rates and the thresholds are measured in
  1192. + * sectors/usec, left-shifted by BFQ_RATE_SHIFT.
  1193. + */
  1194. +static int R_slow[2] = {1536, 10752};
  1195. +static int R_fast[2] = {17415, 34791};
  1196. +/*
  1197. + * To improve readability, a conversion function is used to initialize the
  1198. + * following arrays, which entails that they can be initialized only in a
  1199. + * function.
  1200. + */
  1201. +static int T_slow[2];
  1202. +static int T_fast[2];
  1203. +static int device_speed_thresh[2];
  1204. +
  1205. +#define BFQ_SERVICE_TREE_INIT ((struct bfq_service_tree) \
  1206. + { RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 })
  1207. +
  1208. +#define RQ_BIC(rq) ((struct bfq_io_cq *) (rq)->elv.priv[0])
  1209. +#define RQ_BFQQ(rq) ((rq)->elv.priv[1])
  1210. +
  1211. +static inline void bfq_schedule_dispatch(struct bfq_data *bfqd);
  1212. +
  1213. +#include "bfq-ioc.c"
  1214. +#include "bfq-sched.c"
  1215. +#include "bfq-cgroup.c"
  1216. +
  1217. +#define bfq_class_idle(bfqq) ((bfqq)->entity.ioprio_class ==\
  1218. + IOPRIO_CLASS_IDLE)
  1219. +#define bfq_class_rt(bfqq) ((bfqq)->entity.ioprio_class ==\
  1220. + IOPRIO_CLASS_RT)
  1221. +
  1222. +#define bfq_sample_valid(samples) ((samples) > 80)
  1223. +
  1224. +/*
  1225. + * We regard a request as SYNC if it is either a read or has the SYNC bit
  1226. + * set (in which case it could also be a direct WRITE).
  1227. + */
  1228. +static inline int bfq_bio_sync(struct bio *bio)
  1229. +{
  1230. + if (bio_data_dir(bio) == READ || (bio->bi_rw & REQ_SYNC))
  1231. + return 1;
  1232. +
  1233. + return 0;
  1234. +}
  1235. +
  1236. +/*
  1237. + * Scheduler run of queue, if there are requests pending and no one in the
  1238. + * driver that will restart queueing.
  1239. + */
  1240. +static inline void bfq_schedule_dispatch(struct bfq_data *bfqd)
  1241. +{
  1242. + if (bfqd->queued != 0) {
  1243. + bfq_log(bfqd, "schedule dispatch");
  1244. + kblockd_schedule_work(&bfqd->unplug_work);
  1245. + }
  1246. +}
  1247. +
  1248. +/*
  1249. + * Lifted from AS - choose which of rq1 and rq2 is best served now.
  1250. + * We choose the request that is closest to the head right now. Distance
  1251. + * behind the head is penalized and only allowed to a certain extent.
  1252. + */
  1253. +static struct request *bfq_choose_req(struct bfq_data *bfqd,
  1254. + struct request *rq1,
  1255. + struct request *rq2,
  1256. + sector_t last)
  1257. +{
  1258. + sector_t s1, s2, d1 = 0, d2 = 0;
  1259. + unsigned long back_max;
  1260. +#define BFQ_RQ1_WRAP 0x01 /* request 1 wraps */
  1261. +#define BFQ_RQ2_WRAP 0x02 /* request 2 wraps */
  1262. + unsigned wrap = 0; /* bit mask: requests behind the disk head? */
  1263. +
  1264. + if (rq1 == NULL || rq1 == rq2)
  1265. + return rq2;
  1266. + if (rq2 == NULL)
  1267. + return rq1;
  1268. +
  1269. + if (rq_is_sync(rq1) && !rq_is_sync(rq2))
  1270. + return rq1;
  1271. + else if (rq_is_sync(rq2) && !rq_is_sync(rq1))
  1272. + return rq2;
  1273. + if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META))
  1274. + return rq1;
  1275. + else if ((rq2->cmd_flags & REQ_META) && !(rq1->cmd_flags & REQ_META))
  1276. + return rq2;
  1277. +
  1278. + s1 = blk_rq_pos(rq1);
  1279. + s2 = blk_rq_pos(rq2);
  1280. +
  1281. + /*
  1282. + * By definition, 1KiB is 2 sectors.
  1283. + */
  1284. + back_max = bfqd->bfq_back_max * 2;
  1285. +
  1286. + /*
  1287. + * Strict one way elevator _except_ in the case where we allow
  1288. + * short backward seeks which are biased as twice the cost of a
  1289. + * similar forward seek.
  1290. + */
  1291. + if (s1 >= last)
  1292. + d1 = s1 - last;
  1293. + else if (s1 + back_max >= last)
  1294. + d1 = (last - s1) * bfqd->bfq_back_penalty;
  1295. + else
  1296. + wrap |= BFQ_RQ1_WRAP;
  1297. +
  1298. + if (s2 >= last)
  1299. + d2 = s2 - last;
  1300. + else if (s2 + back_max >= last)
  1301. + d2 = (last - s2) * bfqd->bfq_back_penalty;
  1302. + else
  1303. + wrap |= BFQ_RQ2_WRAP;
  1304. +
  1305. + /* Found required data */
  1306. +
  1307. + /*
  1308. + * By doing switch() on the bit mask "wrap" we avoid having to
  1309. + * check two variables for all permutations: --> faster!
  1310. + */
  1311. + switch (wrap) {
  1312. + case 0: /* common case for CFQ: rq1 and rq2 not wrapped */
  1313. + if (d1 < d2)
  1314. + return rq1;
  1315. + else if (d2 < d1)
  1316. + return rq2;
  1317. + else {
  1318. + if (s1 >= s2)
  1319. + return rq1;
  1320. + else
  1321. + return rq2;
  1322. + }
  1323. +
  1324. + case BFQ_RQ2_WRAP:
  1325. + return rq1;
  1326. + case BFQ_RQ1_WRAP:
  1327. + return rq2;
  1328. + case (BFQ_RQ1_WRAP|BFQ_RQ2_WRAP): /* both rqs wrapped */
  1329. + default:
  1330. + /*
  1331. + * Since both rqs are wrapped,
  1332. + * start with the one that's further behind head
  1333. + * (--> only *one* back seek required),
  1334. + * since back seek takes more time than forward.
  1335. + */
  1336. + if (s1 <= s2)
  1337. + return rq1;
  1338. + else
  1339. + return rq2;
  1340. + }
  1341. +}
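
To make the distance rule above concrete, here is a small standalone sketch of just the d1/d2 computation: forward distance is taken as-is, a short backward seek is charged bfq_back_penalty times its length, and anything farther behind the head than back_max is treated as "wrapped". The sector numbers are hypothetical and not taken from the patch.

#include <stdio.h>

typedef unsigned long long sector_t;

/* Simplified version of the per-request distance used in bfq_choose_req(). */
static sector_t seek_distance(sector_t s, sector_t last,
			      sector_t back_max, unsigned int penalty)
{
	if (s >= last)
		return s - last;
	if (s + back_max >= last)
		return (last - s) * penalty;
	return ~0ULL;	/* the BFQ_RQx_WRAP case */
}

int main(void)
{
	/* Head at sector 100000; bfq_back_max = 16 MiB = 32768 sectors and
	 * bfq_back_penalty = 2, i.e. the defaults above. */
	sector_t last = 100000, back_max = 16 * 1024 * 2;

	printf("%llu\n", seek_distance(100100, last, back_max, 2)); /* 100 */
	printf("%llu\n", seek_distance(99900,  last, back_max, 2)); /* 200 */
	return 0;
}

Although both requests sit 100 sectors from the head, the forward one wins because the backward one is charged twice its distance, which is exactly the bias the comment describes.
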
  1342. +
  1343. +static struct bfq_queue *
  1344. +bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root,
  1345. + sector_t sector, struct rb_node **ret_parent,
  1346. + struct rb_node ***rb_link)
  1347. +{
  1348. + struct rb_node **p, *parent;
  1349. + struct bfq_queue *bfqq = NULL;
  1350. +
  1351. + parent = NULL;
  1352. + p = &root->rb_node;
  1353. + while (*p) {
  1354. + struct rb_node **n;
  1355. +
  1356. + parent = *p;
  1357. + bfqq = rb_entry(parent, struct bfq_queue, pos_node);
  1358. +
  1359. + /*
  1360. + * Sort strictly based on sector. Smallest to the left,
  1361. + * largest to the right.
  1362. + */
  1363. + if (sector > blk_rq_pos(bfqq->next_rq))
  1364. + n = &(*p)->rb_right;
  1365. + else if (sector < blk_rq_pos(bfqq->next_rq))
  1366. + n = &(*p)->rb_left;
  1367. + else
  1368. + break;
  1369. + p = n;
  1370. + bfqq = NULL;
  1371. + }
  1372. +
  1373. + *ret_parent = parent;
  1374. + if (rb_link)
  1375. + *rb_link = p;
  1376. +
  1377. + bfq_log(bfqd, "rq_pos_tree_lookup %llu: returning %d",
  1378. + (long long unsigned)sector,
  1379. + bfqq != NULL ? bfqq->pid : 0);
  1380. +
  1381. + return bfqq;
  1382. +}
  1383. +
  1384. +static void bfq_rq_pos_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq)
  1385. +{
  1386. + struct rb_node **p, *parent;
  1387. + struct bfq_queue *__bfqq;
  1388. +
  1389. + if (bfqq->pos_root != NULL) {
  1390. + rb_erase(&bfqq->pos_node, bfqq->pos_root);
  1391. + bfqq->pos_root = NULL;
  1392. + }
  1393. +
  1394. + if (bfq_class_idle(bfqq))
  1395. + return;
  1396. + if (!bfqq->next_rq)
  1397. + return;
  1398. +
  1399. + bfqq->pos_root = &bfqd->rq_pos_tree;
  1400. + __bfqq = bfq_rq_pos_tree_lookup(bfqd, bfqq->pos_root,
  1401. + blk_rq_pos(bfqq->next_rq), &parent, &p);
  1402. + if (__bfqq == NULL) {
  1403. + rb_link_node(&bfqq->pos_node, parent, p);
  1404. + rb_insert_color(&bfqq->pos_node, bfqq->pos_root);
  1405. + } else
  1406. + bfqq->pos_root = NULL;
  1407. +}
  1408. +
  1409. +/*
  1410. + * Tell whether there are active queues or groups with differentiated weights.
  1411. + */
  1412. +static inline bool bfq_differentiated_weights(struct bfq_data *bfqd)
  1413. +{
  1414. + BUG_ON(!bfqd->hw_tag);
  1415. + /*
  1416. + * For weights to differ, at least one of the trees must contain
  1417. + * at least two nodes.
  1418. + */
  1419. + return (!RB_EMPTY_ROOT(&bfqd->queue_weights_tree) &&
  1420. + (bfqd->queue_weights_tree.rb_node->rb_left ||
  1421. + bfqd->queue_weights_tree.rb_node->rb_right)
  1422. +#ifdef CONFIG_CGROUP_BFQIO
  1423. + ) ||
  1424. + (!RB_EMPTY_ROOT(&bfqd->group_weights_tree) &&
  1425. + (bfqd->group_weights_tree.rb_node->rb_left ||
  1426. + bfqd->group_weights_tree.rb_node->rb_right)
  1427. +#endif
  1428. + );
  1429. +}
  1430. +
  1431. +/*
  1432. + * If the weight-counter tree passed as input contains no counter for
  1433. + * the weight of the input entity, then add that counter; otherwise just
  1434. + * increment the existing counter.
  1435. + *
  1436. + * Note that weight-counter trees contain few nodes in mostly symmetric
  1437. + * scenarios. For example, if all queues have the same weight, then the
  1438. + * weight-counter tree for the queues may contain at most one node.
  1439. + * This holds even if low_latency is on, because weight-raised queues
  1440. + * are not inserted in the tree.
  1441. + * In most scenarios, the rate at which nodes are created/destroyed
  1442. + * should be low too.
  1443. + */
  1444. +static void bfq_weights_tree_add(struct bfq_data *bfqd,
  1445. + struct bfq_entity *entity,
  1446. + struct rb_root *root)
  1447. +{
  1448. + struct rb_node **new = &(root->rb_node), *parent = NULL;
  1449. +
  1450. + /*
  1451. + * Do not insert if:
  1452. + * - the device does not support queueing;
  1453. + * - the entity is already associated with a counter, which happens if:
  1454. + * 1) the entity is associated with a queue, 2) a request arrival
  1455. + * has caused the queue to become both non-weight-raised, and hence
  1456. + * change its weight, and backlogged; in this respect, each
  1457. + * of the two events causes an invocation of this function,
  1458. + * 3) this is the invocation of this function caused by the second
  1459. + * event. This second invocation is actually useless, and we handle
  1460. + * this fact by exiting immediately. More efficient or clearer
  1461. + * solutions might possibly be adopted.
  1462. + */
  1463. + if (!bfqd->hw_tag || entity->weight_counter)
  1464. + return;
  1465. +
  1466. + while (*new) {
  1467. + struct bfq_weight_counter *__counter = container_of(*new,
  1468. + struct bfq_weight_counter,
  1469. + weights_node);
  1470. + parent = *new;
  1471. +
  1472. + if (entity->weight == __counter->weight) {
  1473. + entity->weight_counter = __counter;
  1474. + goto inc_counter;
  1475. + }
  1476. + if (entity->weight < __counter->weight)
  1477. + new = &((*new)->rb_left);
  1478. + else
  1479. + new = &((*new)->rb_right);
  1480. + }
  1481. +
  1482. + entity->weight_counter = kzalloc(sizeof(struct bfq_weight_counter),
  1483. + GFP_ATOMIC);
  1484. + entity->weight_counter->weight = entity->weight;
  1485. + rb_link_node(&entity->weight_counter->weights_node, parent, new);
  1486. + rb_insert_color(&entity->weight_counter->weights_node, root);
  1487. +
  1488. +inc_counter:
  1489. + entity->weight_counter->num_active++;
  1490. +}
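
The bookkeeping above keeps one counter node per distinct weight, with num_active counting how many active entities currently share that weight; this is also why, in symmetric scenarios, the tree collapses to a single node (the case bfq_differentiated_weights() tests for). A minimal userspace sketch of the same idea, using a sorted singly-linked list instead of an rbtree; all names and numbers below are made up for illustration.

#include <stdio.h>
#include <stdlib.h>

/* One counter per distinct weight, as in struct bfq_weight_counter. */
struct weight_counter {
	int weight;
	int num_active;
	struct weight_counter *next;
};

/* Add an active entity of the given weight: bump an existing counter or
 * create a new one, keeping the list sorted by weight. */
static void weights_add(struct weight_counter **head, int weight)
{
	struct weight_counter **pp = head, *wc;

	while (*pp && (*pp)->weight < weight)
		pp = &(*pp)->next;
	if (*pp && (*pp)->weight == weight) {
		(*pp)->num_active++;
		return;
	}
	wc = calloc(1, sizeof(*wc));
	if (!wc)
		return;
	wc->weight = weight;
	wc->num_active = 1;
	wc->next = *pp;
	*pp = wc;
}

int main(void)
{
	struct weight_counter *head = NULL, *wc;

	/* Three queues with weight 100 and one with weight 300: only two
	 * counter nodes exist, so the weights are "differentiated". */
	weights_add(&head, 100);
	weights_add(&head, 100);
	weights_add(&head, 300);
	weights_add(&head, 100);

	for (wc = head; wc; wc = wc->next)
		printf("weight %d: %d active\n", wc->weight, wc->num_active);
	return 0;
}
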
  1491. +
  1492. +/*
  1493. + * Decrement the weight counter associated with the entity, and, if the
  1494. + * counter reaches 0, remove the counter from the tree.
  1495. + * See the comments to the function bfq_weights_tree_add() for considerations
  1496. + * about overhead.
  1497. + */
  1498. +static void bfq_weights_tree_remove(struct bfq_data *bfqd,
  1499. + struct bfq_entity *entity,
  1500. + struct rb_root *root)
  1501. +{
  1502. + /*
  1503. + * Check whether the entity is actually associated with a counter.
  1504. + * In fact, the device may not be considered NCQ-capable for a while,
  1505. + * which implies that no insertion in the weight trees is performed,
  1506. + * after which the device may start to be deemed NCQ-capable, and hence
  1507. + * this function may start to be invoked. This may cause the function
  1508. + * to be invoked for entities that are not associated with any counter.
  1509. + */
  1510. + if (!entity->weight_counter)
  1511. + return;
  1512. +
  1513. + BUG_ON(RB_EMPTY_ROOT(root));
  1514. + BUG_ON(entity->weight_counter->weight != entity->weight);
  1515. +
  1516. + BUG_ON(!entity->weight_counter->num_active);
  1517. + entity->weight_counter->num_active--;
  1518. + if (entity->weight_counter->num_active > 0)
  1519. + goto reset_entity_pointer;
  1520. +
  1521. + rb_erase(&entity->weight_counter->weights_node, root);
  1522. + kfree(entity->weight_counter);
  1523. +
  1524. +reset_entity_pointer:
  1525. + entity->weight_counter = NULL;
  1526. +}
  1527. +
  1528. +static struct request *bfq_find_next_rq(struct bfq_data *bfqd,
  1529. + struct bfq_queue *bfqq,
  1530. + struct request *last)
  1531. +{
  1532. + struct rb_node *rbnext = rb_next(&last->rb_node);
  1533. + struct rb_node *rbprev = rb_prev(&last->rb_node);
  1534. + struct request *next = NULL, *prev = NULL;
  1535. +
  1536. + BUG_ON(RB_EMPTY_NODE(&last->rb_node));
  1537. +
  1538. + if (rbprev != NULL)
  1539. + prev = rb_entry_rq(rbprev);
  1540. +
  1541. + if (rbnext != NULL)
  1542. + next = rb_entry_rq(rbnext);
  1543. + else {
  1544. + rbnext = rb_first(&bfqq->sort_list);
  1545. + if (rbnext && rbnext != &last->rb_node)
  1546. + next = rb_entry_rq(rbnext);
  1547. + }
  1548. +
  1549. + return bfq_choose_req(bfqd, next, prev, blk_rq_pos(last));
  1550. +}
  1551. +
  1552. +/* see the definition of bfq_async_charge_factor for details */
  1553. +static inline unsigned long bfq_serv_to_charge(struct request *rq,
  1554. + struct bfq_queue *bfqq)
  1555. +{
  1556. + return blk_rq_sectors(rq) *
  1557. + (1 + ((!bfq_bfqq_sync(bfqq)) * (bfqq->wr_coeff == 1) *
  1558. + bfq_async_charge_factor));
  1559. +}
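
A quick worked example of the charging rule above, with made-up request sizes: a sync or weight-raised queue is charged just the request size in sectors, while a plain async queue is charged (1 + bfq_async_charge_factor) times that size.

#include <stdio.h>

/* Same arithmetic as bfq_serv_to_charge(), with the queue state reduced to
 * two flags.  The sizes and states below are invented for illustration. */
static unsigned long serv_to_charge(unsigned long sectors, int sync,
				    int weight_raised)
{
	const int async_charge_factor = 10;	/* bfq_async_charge_factor */

	return sectors * (1 + (!sync) * (!weight_raised) *
			  async_charge_factor);
}

int main(void)
{
	printf("%lu\n", serv_to_charge(256, 1, 0));	/* sync: 256 */
	printf("%lu\n", serv_to_charge(256, 0, 0));	/* async: 2816 */
	printf("%lu\n", serv_to_charge(256, 0, 1));	/* async, raised: 256 */
	return 0;
}
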
  1560. +
  1561. +/**
  1562. + * bfq_updated_next_req - update the queue after a new next_rq selection.
  1563. + * @bfqd: the device data the queue belongs to.
  1564. + * @bfqq: the queue to update.
  1565. + *
  1566. + * If the first request of a queue changes we make sure that the queue
  1567. + * has enough budget to serve at least its first request (if the
  1568. + * request has grown). We do this because, if the queue does not have enough
  1569. + * budget for its first request, it has to go through two dispatch
  1570. + * rounds to actually get it dispatched.
  1571. + */
  1572. +static void bfq_updated_next_req(struct bfq_data *bfqd,
  1573. + struct bfq_queue *bfqq)
  1574. +{
  1575. + struct bfq_entity *entity = &bfqq->entity;
  1576. + struct bfq_service_tree *st = bfq_entity_service_tree(entity);
  1577. + struct request *next_rq = bfqq->next_rq;
  1578. + unsigned long new_budget;
  1579. +
  1580. + if (next_rq == NULL)
  1581. + return;
  1582. +
  1583. + if (bfqq == bfqd->in_service_queue)
  1584. + /*
  1585. + * In order not to break guarantees, budgets cannot be
  1586. + * changed after an entity has been selected.
  1587. + */
  1588. + return;
  1589. +
  1590. + BUG_ON(entity->tree != &st->active);
  1591. + BUG_ON(entity == entity->sched_data->in_service_entity);
  1592. +
  1593. + new_budget = max_t(unsigned long, bfqq->max_budget,
  1594. + bfq_serv_to_charge(next_rq, bfqq));
  1595. + if (entity->budget != new_budget) {
  1596. + entity->budget = new_budget;
  1597. + bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu",
  1598. + new_budget);
  1599. + bfq_activate_bfqq(bfqd, bfqq);
  1600. + }
  1601. +}
  1602. +
  1603. +static inline unsigned int bfq_wr_duration(struct bfq_data *bfqd)
  1604. +{
  1605. + u64 dur;
  1606. +
  1607. + if (bfqd->bfq_wr_max_time > 0)
  1608. + return bfqd->bfq_wr_max_time;
  1609. +
  1610. + dur = bfqd->RT_prod;
  1611. + do_div(dur, bfqd->peak_rate);
  1612. +
  1613. + return dur;
  1614. +}
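
A numeric sketch of the duration = (R / r) * T formula from the header comment, assuming bfqd->RT_prod is the precomputed product R * T for the device's current speed class (the reference time T and the measured rate below are made-up values, not taken from the patch):

#include <stdio.h>

int main(void)
{
	/* Hypothetical numbers: R is the R_fast[0] reference rate above,
	 * T a made-up reference time of 1375 jiffies, and the measured
	 * peak rate r half the reference one.  Both rates are in
	 * sectors/usec << BFQ_RATE_SHIFT, so the shift cancels out. */
	unsigned long long R = 17415, T = 1375, r = R / 2;
	unsigned long long RT_prod = R * T;
	unsigned long long dur = RT_prod / r;	/* what do_div() computes */

	/* A device half as fast as the reference one gets roughly twice the
	 * reference weight-raising duration: 2750 jiffies here. */
	printf("wr duration: %llu jiffies\n", dur);
	return 0;
}
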
  1615. +
  1616. +static void bfq_add_request(struct request *rq)
  1617. +{
  1618. + struct bfq_queue *bfqq = RQ_BFQQ(rq);
  1619. + struct bfq_entity *entity = &bfqq->entity;
  1620. + struct bfq_data *bfqd = bfqq->bfqd;
  1621. + struct request *next_rq, *prev;
  1622. + unsigned long old_wr_coeff = bfqq->wr_coeff;
  1623. + int idle_for_long_time = 0;
  1624. +
  1625. + bfq_log_bfqq(bfqd, bfqq, "add_request %d", rq_is_sync(rq));
  1626. + bfqq->queued[rq_is_sync(rq)]++;
  1627. + bfqd->queued++;
  1628. +
  1629. + elv_rb_add(&bfqq->sort_list, rq);
  1630. +
  1631. + /*
  1632. + * Check if this request is a better next-serve candidate.
  1633. + */
  1634. + prev = bfqq->next_rq;
  1635. + next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position);
  1636. + BUG_ON(next_rq == NULL);
  1637. + bfqq->next_rq = next_rq;
  1638. +
  1639. + /*
  1640. + * Adjust priority tree position, if next_rq changes.
  1641. + */
  1642. + if (prev != bfqq->next_rq)
  1643. + bfq_rq_pos_tree_add(bfqd, bfqq);
  1644. +
  1645. + if (!bfq_bfqq_busy(bfqq)) {
  1646. + int soft_rt = bfqd->bfq_wr_max_softrt_rate > 0 &&
  1647. + time_is_before_jiffies(bfqq->soft_rt_next_start);
  1648. + idle_for_long_time = time_is_before_jiffies(
  1649. + bfqq->budget_timeout +
  1650. + bfqd->bfq_wr_min_idle_time);
  1651. + entity->budget = max_t(unsigned long, bfqq->max_budget,
  1652. + bfq_serv_to_charge(next_rq, bfqq));
  1653. +
  1654. + if (!bfq_bfqq_IO_bound(bfqq)) {
  1655. + if (time_before(jiffies,
  1656. + RQ_BIC(rq)->ttime.last_end_request +
  1657. + bfqd->bfq_slice_idle)) {
  1658. + bfqq->requests_within_timer++;
  1659. + if (bfqq->requests_within_timer >=
  1660. + bfqd->bfq_requests_within_timer)
  1661. + bfq_mark_bfqq_IO_bound(bfqq);
  1662. + } else
  1663. + bfqq->requests_within_timer = 0;
  1664. + }
  1665. +
  1666. + if (!bfqd->low_latency)
  1667. + goto add_bfqq_busy;
  1668. +
  1669. + /*
  1670. + * If the queue is not being boosted and has been idle
  1671. + * for enough time, start a weight-raising period
  1672. + */
  1673. + if (old_wr_coeff == 1 && (idle_for_long_time || soft_rt)) {
  1674. + bfqq->wr_coeff = bfqd->bfq_wr_coeff;
  1675. + if (idle_for_long_time)
  1676. + bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
  1677. + else
  1678. + bfqq->wr_cur_max_time =
  1679. + bfqd->bfq_wr_rt_max_time;
  1680. + bfq_log_bfqq(bfqd, bfqq,
  1681. + "wrais starting at %lu, rais_max_time %u",
  1682. + jiffies,
  1683. + jiffies_to_msecs(bfqq->wr_cur_max_time));
  1684. + } else if (old_wr_coeff > 1) {
  1685. + if (idle_for_long_time)
  1686. + bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
  1687. + else if (bfqq->wr_cur_max_time ==
  1688. + bfqd->bfq_wr_rt_max_time &&
  1689. + !soft_rt) {
  1690. + bfqq->wr_coeff = 1;
  1691. + bfq_log_bfqq(bfqd, bfqq,
  1692. + "wrais ending at %lu, rais_max_time %u",
  1693. + jiffies,
  1694. + jiffies_to_msecs(bfqq->
  1695. + wr_cur_max_time));
  1696. + } else if (time_before(
  1697. + bfqq->last_wr_start_finish +
  1698. + bfqq->wr_cur_max_time,
  1699. + jiffies +
  1700. + bfqd->bfq_wr_rt_max_time) &&
  1701. + soft_rt) {
  1702. + /*
  1703. + *
  1704. + * The remaining weight-raising time is lower
  1705. + * than bfqd->bfq_wr_rt_max_time, which
  1706. + * means that the application is enjoying
  1707. + * weight raising either because deemed soft-
  1708. + * rt in the near past, or because deemed
  1709. + * interactive a long ago. In both cases,
  1710. + * resetting now the current remaining weight-
  1711. + * raising time for the application to the
  1712. + * weight-raising duration for soft rt
  1713. + * applications would not cause any latency
  1714. + * increase for the application (as the new
  1715. + * duration would be higher than the remaining
  1716. + * time).
  1717. + *
  1718. + * In addition, the application is now meeting
  1719. + * the requirements for being deemed soft rt.
  1720. + * In the end we can correctly and safely
  1721. + * (re)charge the weight-raising duration for
  1722. + * the application with the weight-raising
  1723. + * duration for soft rt applications.
  1724. + *
  1725. + * In particular, doing this recharge now, i.e.,
  1726. + * before the weight-raising period for the
  1727. + * application finishes, reduces the probability
  1728. + * of the following negative scenario:
  1729. + * 1) the weight of a soft rt application is
  1730. + * raised at startup (as for any newly
  1731. + * created application),
  1732. + * 2) since the application is not interactive,
  1733. + * at a certain time weight-raising is
  1734. + * stopped for the application,
  1735. + * 3) at that time the application happens to
  1736. + * still have pending requests, and hence
  1737. + * is destined to not have a chance to be
  1738. + * deemed soft rt before these requests are
  1739. + * completed (see the comments to the
  1740. + * function bfq_bfqq_softrt_next_start()
  1741. + * for details on soft rt detection),
  1742. + * 4) these pending requests experience a high
  1743. + * latency because the application is not
  1744. + * weight-raised while they are pending.
  1745. + */
  1746. + bfqq->last_wr_start_finish = jiffies;
  1747. + bfqq->wr_cur_max_time =
  1748. + bfqd->bfq_wr_rt_max_time;
  1749. + }
  1750. + }
  1751. + if (old_wr_coeff != bfqq->wr_coeff)
  1752. + entity->ioprio_changed = 1;
  1753. +add_bfqq_busy:
  1754. + bfqq->last_idle_bklogged = jiffies;
  1755. + bfqq->service_from_backlogged = 0;
  1756. + bfq_clear_bfqq_softrt_update(bfqq);
  1757. + bfq_add_bfqq_busy(bfqd, bfqq);
  1758. + } else {
  1759. + if (bfqd->low_latency && old_wr_coeff == 1 && !rq_is_sync(rq) &&
  1760. + time_is_before_jiffies(
  1761. + bfqq->last_wr_start_finish +
  1762. + bfqd->bfq_wr_min_inter_arr_async)) {
  1763. + bfqq->wr_coeff = bfqd->bfq_wr_coeff;
  1764. + bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
  1765. +
  1766. + bfqd->wr_busy_queues++;
  1767. + entity->ioprio_changed = 1;
  1768. + bfq_log_bfqq(bfqd, bfqq,
  1769. + "non-idle wrais starting at %lu, rais_max_time %u",
  1770. + jiffies,
  1771. + jiffies_to_msecs(bfqq->wr_cur_max_time));
  1772. + }
  1773. + if (prev != bfqq->next_rq)
  1774. + bfq_updated_next_req(bfqd, bfqq);
  1775. + }
  1776. +
  1777. + if (bfqd->low_latency &&
  1778. + (old_wr_coeff == 1 || bfqq->wr_coeff == 1 ||
  1779. + idle_for_long_time))
  1780. + bfqq->last_wr_start_finish = jiffies;
  1781. +}
  1782. +
  1783. +static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd,
  1784. + struct bio *bio)
  1785. +{
  1786. + struct task_struct *tsk = current;
  1787. + struct bfq_io_cq *bic;
  1788. + struct bfq_queue *bfqq;
  1789. +
  1790. + bic = bfq_bic_lookup(bfqd, tsk->io_context);
  1791. + if (bic == NULL)
  1792. + return NULL;
  1793. +
  1794. + bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));
  1795. + if (bfqq != NULL)
  1796. + return elv_rb_find(&bfqq->sort_list, bio_end_sector(bio));
  1797. +
  1798. + return NULL;
  1799. +}
  1800. +
  1801. +static void bfq_activate_request(struct request_queue *q, struct request *rq)
  1802. +{
  1803. + struct bfq_data *bfqd = q->elevator->elevator_data;
  1804. +
  1805. + bfqd->rq_in_driver++;
  1806. + bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq);
  1807. + bfq_log(bfqd, "activate_request: new bfqd->last_position %llu",
  1808. + (long long unsigned)bfqd->last_position);
  1809. +}
  1810. +
  1811. +static inline void bfq_deactivate_request(struct request_queue *q,
  1812. + struct request *rq)
  1813. +{
  1814. + struct bfq_data *bfqd = q->elevator->elevator_data;
  1815. +
  1816. + BUG_ON(bfqd->rq_in_driver == 0);
  1817. + bfqd->rq_in_driver--;
  1818. +}
  1819. +
  1820. +static void bfq_remove_request(struct request *rq)
  1821. +{
  1822. + struct bfq_queue *bfqq = RQ_BFQQ(rq);
  1823. + struct bfq_data *bfqd = bfqq->bfqd;
  1824. + const int sync = rq_is_sync(rq);
  1825. +
  1826. + if (bfqq->next_rq == rq) {
  1827. + bfqq->next_rq = bfq_find_next_rq(bfqd, bfqq, rq);
  1828. + bfq_updated_next_req(bfqd, bfqq);
  1829. + }
  1830. +
  1831. + list_del_init(&rq->queuelist);
  1832. + BUG_ON(bfqq->queued[sync] == 0);
  1833. + bfqq->queued[sync]--;
  1834. + bfqd->queued--;
  1835. + elv_rb_del(&bfqq->sort_list, rq);
  1836. +
  1837. + if (RB_EMPTY_ROOT(&bfqq->sort_list)) {
  1838. + if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->in_service_queue)
  1839. + bfq_del_bfqq_busy(bfqd, bfqq, 1);
  1840. + /*
  1841. + * Remove queue from request-position tree as it is empty.
  1842. + */
  1843. + if (bfqq->pos_root != NULL) {
  1844. + rb_erase(&bfqq->pos_node, bfqq->pos_root);
  1845. + bfqq->pos_root = NULL;
  1846. + }
  1847. + }
  1848. +
  1849. + if (rq->cmd_flags & REQ_META) {
  1850. + BUG_ON(bfqq->meta_pending == 0);
  1851. + bfqq->meta_pending--;
  1852. + }
  1853. +}
  1854. +
  1855. +static int bfq_merge(struct request_queue *q, struct request **req,
  1856. + struct bio *bio)
  1857. +{
  1858. + struct bfq_data *bfqd = q->elevator->elevator_data;
  1859. + struct request *__rq;
  1860. +
  1861. + __rq = bfq_find_rq_fmerge(bfqd, bio);
  1862. + if (__rq != NULL && elv_rq_merge_ok(__rq, bio)) {
  1863. + *req = __rq;
  1864. + return ELEVATOR_FRONT_MERGE;
  1865. + }
  1866. +
  1867. + return ELEVATOR_NO_MERGE;
  1868. +}
  1869. +
  1870. +static void bfq_merged_request(struct request_queue *q, struct request *req,
  1871. + int type)
  1872. +{
  1873. + if (type == ELEVATOR_FRONT_MERGE &&
  1874. + rb_prev(&req->rb_node) &&
  1875. + blk_rq_pos(req) <
  1876. + blk_rq_pos(container_of(rb_prev(&req->rb_node),
  1877. + struct request, rb_node))) {
  1878. + struct bfq_queue *bfqq = RQ_BFQQ(req);
  1879. + struct bfq_data *bfqd = bfqq->bfqd;
  1880. + struct request *prev, *next_rq;
  1881. +
  1882. + /* Reposition request in its sort_list */
  1883. + elv_rb_del(&bfqq->sort_list, req);
  1884. + elv_rb_add(&bfqq->sort_list, req);
  1885. + /* Choose next request to be served for bfqq */
  1886. + prev = bfqq->next_rq;
  1887. + next_rq = bfq_choose_req(bfqd, bfqq->next_rq, req,
  1888. + bfqd->last_position);
  1889. + BUG_ON(next_rq == NULL);
  1890. + bfqq->next_rq = next_rq;
  1891. + /*
  1892. + * If next_rq changes, update both the queue's budget to
  1893. + * fit the new request and the queue's position in its
  1894. + * rq_pos_tree.
  1895. + */
  1896. + if (prev != bfqq->next_rq) {
  1897. + bfq_updated_next_req(bfqd, bfqq);
  1898. + bfq_rq_pos_tree_add(bfqd, bfqq);
  1899. + }
  1900. + }
  1901. +}
  1902. +
  1903. +static void bfq_merged_requests(struct request_queue *q, struct request *rq,
  1904. + struct request *next)
  1905. +{
  1906. + struct bfq_queue *bfqq = RQ_BFQQ(rq);
  1907. +
  1908. + /*
  1909. + * Reposition in fifo if next is older than rq.
  1910. + */
  1911. + if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist) &&
  1912. + time_before(next->fifo_time, rq->fifo_time)) {
  1913. + list_move(&rq->queuelist, &next->queuelist);
  1914. + rq->fifo_time = next->fifo_time;
  1915. + }
  1916. +
  1917. + if (bfqq->next_rq == next)
  1918. + bfqq->next_rq = rq;
  1919. +
  1920. + bfq_remove_request(next);
  1921. +}
  1922. +
  1923. +/* Must be called with bfqq != NULL */
  1924. +static inline void bfq_bfqq_end_wr(struct bfq_queue *bfqq)
  1925. +{
  1926. + BUG_ON(bfqq == NULL);
  1927. + if (bfq_bfqq_busy(bfqq))
  1928. + bfqq->bfqd->wr_busy_queues--;
  1929. + bfqq->wr_coeff = 1;
  1930. + bfqq->wr_cur_max_time = 0;
  1931. + /* Trigger a weight change on the next activation of the queue */
  1932. + bfqq->entity.ioprio_changed = 1;
  1933. +}
  1934. +
  1935. +static void bfq_end_wr_async_queues(struct bfq_data *bfqd,
  1936. + struct bfq_group *bfqg)
  1937. +{
  1938. + int i, j;
  1939. +
  1940. + for (i = 0; i < 2; i++)
  1941. + for (j = 0; j < IOPRIO_BE_NR; j++)
  1942. + if (bfqg->async_bfqq[i][j] != NULL)
  1943. + bfq_bfqq_end_wr(bfqg->async_bfqq[i][j]);
  1944. + if (bfqg->async_idle_bfqq != NULL)
  1945. + bfq_bfqq_end_wr(bfqg->async_idle_bfqq);
  1946. +}
  1947. +
  1948. +static void bfq_end_wr(struct bfq_data *bfqd)
  1949. +{
  1950. + struct bfq_queue *bfqq;
  1951. +
  1952. + spin_lock_irq(bfqd->queue->queue_lock);
  1953. +
  1954. + list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list)
  1955. + bfq_bfqq_end_wr(bfqq);
  1956. + list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list)
  1957. + bfq_bfqq_end_wr(bfqq);
  1958. + bfq_end_wr_async(bfqd);
  1959. +
  1960. + spin_unlock_irq(bfqd->queue->queue_lock);
  1961. +}
  1962. +
  1963. +static int bfq_allow_merge(struct request_queue *q, struct request *rq,
  1964. + struct bio *bio)
  1965. +{
  1966. + struct bfq_data *bfqd = q->elevator->elevator_data;
  1967. + struct bfq_io_cq *bic;
  1968. + struct bfq_queue *bfqq;
  1969. +
  1970. + /*
  1971. + * Disallow merge of a sync bio into an async request.
  1972. + */
  1973. + if (bfq_bio_sync(bio) && !rq_is_sync(rq))
  1974. + return 0;
  1975. +
  1976. + /*
  1977. + * Lookup the bfqq that this bio will be queued with. Allow
  1978. + * merge only if rq is queued there.
  1979. + * Queue lock is held here.
  1980. + */
  1981. + bic = bfq_bic_lookup(bfqd, current->io_context);
  1982. + if (bic == NULL)
  1983. + return 0;
  1984. +
  1985. + bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));
  1986. + return bfqq == RQ_BFQQ(rq);
  1987. +}
  1988. +
  1989. +static void __bfq_set_in_service_queue(struct bfq_data *bfqd,
  1990. + struct bfq_queue *bfqq)
  1991. +{
  1992. + if (bfqq != NULL) {
  1993. + bfq_mark_bfqq_must_alloc(bfqq);
  1994. + bfq_mark_bfqq_budget_new(bfqq);
  1995. + bfq_clear_bfqq_fifo_expire(bfqq);
  1996. +
  1997. + bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8;
  1998. +
  1999. + bfq_log_bfqq(bfqd, bfqq,
  2000. + "set_in_service_queue, cur-budget = %lu",
  2001. + bfqq->entity.budget);
  2002. + }
  2003. +
  2004. + bfqd->in_service_queue = bfqq;
  2005. +}
  2006. +
  2007. +/*
  2008. + * Get and set a new queue for service.
  2009. + */
  2010. +static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd,
  2011. + struct bfq_queue *bfqq)
  2012. +{
  2013. + if (!bfqq)
  2014. + bfqq = bfq_get_next_queue(bfqd);
  2015. + else
  2016. + bfq_get_next_queue_forced(bfqd, bfqq);
  2017. +
  2018. + __bfq_set_in_service_queue(bfqd, bfqq);
  2019. + return bfqq;
  2020. +}
  2021. +
  2022. +static inline sector_t bfq_dist_from_last(struct bfq_data *bfqd,
  2023. + struct request *rq)
  2024. +{
  2025. + if (blk_rq_pos(rq) >= bfqd->last_position)
  2026. + return blk_rq_pos(rq) - bfqd->last_position;
  2027. + else
  2028. + return bfqd->last_position - blk_rq_pos(rq);
  2029. +}
  2030. +
  2031. +/*
  2032. + * Tell whether rq is close enough to bfqd->last_position, i.e., whether
  2033. + * its distance from the last dispatched position is within the seek
  2034. + * threshold BFQQ_SEEK_THR.
  2035. + */
  2036. +static inline int bfq_rq_close(struct bfq_data *bfqd, struct request *rq)
  2037. +{
  2038. + return bfq_dist_from_last(bfqd, rq) <= BFQQ_SEEK_THR;
  2039. +}
  2040. +
  2041. +static struct bfq_queue *bfqq_close(struct bfq_data *bfqd)
  2042. +{
  2043. + struct rb_root *root = &bfqd->rq_pos_tree;
  2044. + struct rb_node *parent, *node;
  2045. + struct bfq_queue *__bfqq;
  2046. + sector_t sector = bfqd->last_position;
  2047. +
  2048. + if (RB_EMPTY_ROOT(root))
  2049. + return NULL;
  2050. +
  2051. + /*
  2052. + * First, if we find a request starting at the end of the last
  2053. + * request, choose it.
  2054. + */
  2055. + __bfqq = bfq_rq_pos_tree_lookup(bfqd, root, sector, &parent, NULL);
  2056. + if (__bfqq != NULL)
  2057. + return __bfqq;
  2058. +
  2059. + /*
  2060. + * If the exact sector wasn't found, the parent of the NULL leaf
  2061. + * will contain the closest sector (rq_pos_tree sorted by
  2062. + * next_request position).
  2063. + */
  2064. + __bfqq = rb_entry(parent, struct bfq_queue, pos_node);
  2065. + if (bfq_rq_close(bfqd, __bfqq->next_rq))
  2066. + return __bfqq;
  2067. +
  2068. + if (blk_rq_pos(__bfqq->next_rq) < sector)
  2069. + node = rb_next(&__bfqq->pos_node);
  2070. + else
  2071. + node = rb_prev(&__bfqq->pos_node);
  2072. + if (node == NULL)
  2073. + return NULL;
  2074. +
  2075. + __bfqq = rb_entry(node, struct bfq_queue, pos_node);
  2076. + if (bfq_rq_close(bfqd, __bfqq->next_rq))
  2077. + return __bfqq;
  2078. +
  2079. + return NULL;
  2080. +}
  2081. +
  2082. +/*
  2083. + * bfqd - obvious
  2084. + * cur_bfqq - passed in so that we don't decide that the current queue
  2085. + * is closely cooperating with itself.
  2086. + *
  2087. + * We are assuming that cur_bfqq has dispatched at least one request,
  2088. + * and that bfqd->last_position reflects a position on the disk associated
  2089. + * with the I/O issued by cur_bfqq.
  2090. + */
  2091. +static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd,
  2092. + struct bfq_queue *cur_bfqq)
  2093. +{
  2094. + struct bfq_queue *bfqq;
  2095. +
  2096. + if (bfq_class_idle(cur_bfqq))
  2097. + return NULL;
  2098. + if (!bfq_bfqq_sync(cur_bfqq))
  2099. + return NULL;
  2100. + if (BFQQ_SEEKY(cur_bfqq))
  2101. + return NULL;
  2102. +
  2103. + /* If device has only one backlogged bfq_queue, don't search. */
  2104. + if (bfqd->busy_queues == 1)
  2105. + return NULL;
  2106. +
  2107. + /*
  2108. + * We should notice if some of the queues are cooperating, e.g.
  2109. + * working closely on the same area of the disk. In that case,
  2110. + * we can group them together and not waste time idling.
  2111. + */
  2112. + bfqq = bfqq_close(bfqd);
  2113. + if (bfqq == NULL || bfqq == cur_bfqq)
  2114. + return NULL;
  2115. +
  2116. + /*
  2117. + * Do not merge queues from different bfq_groups.
  2118. + */
  2119. + if (bfqq->entity.parent != cur_bfqq->entity.parent)
  2120. + return NULL;
  2121. +
  2122. + /*
  2123. + * It only makes sense to merge sync queues.
  2124. + */
  2125. + if (!bfq_bfqq_sync(bfqq))
  2126. + return NULL;
  2127. + if (BFQQ_SEEKY(bfqq))
  2128. + return NULL;
  2129. +
  2130. + /*
  2131. + * Do not merge queues of different priority classes.
  2132. + */
  2133. + if (bfq_class_rt(bfqq) != bfq_class_rt(cur_bfqq))
  2134. + return NULL;
  2135. +
  2136. + return bfqq;
  2137. +}
  2138. +
  2139. +/*
  2140. + * If enough samples have been computed, return the current max budget
  2141. + * stored in bfqd, which is dynamically updated according to the
  2142. + * estimated disk peak rate; otherwise return the default max budget
  2143. + */
  2144. +static inline unsigned long bfq_max_budget(struct bfq_data *bfqd)
  2145. +{
  2146. + if (bfqd->budgets_assigned < 194)
  2147. + return bfq_default_max_budget;
  2148. + else
  2149. + return bfqd->bfq_max_budget;
  2150. +}
  2151. +
  2152. +/*
  2153. + * Return min budget, which is a fraction of the current or default
  2154. + * max budget (trying with 1/32)
  2155. + */
  2156. +static inline unsigned long bfq_min_budget(struct bfq_data *bfqd)
  2157. +{
  2158. + if (bfqd->budgets_assigned < 194)
  2159. + return bfq_default_max_budget / 32;
  2160. + else
  2161. + return bfqd->bfq_max_budget / 32;
  2162. +}
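
Until 194 budgets have been assigned, both helpers above fall back to the defaults, so the effective values at start-up follow directly from bfq_default_max_budget. A tiny worked example of those defaults (the KiB conversion assumes 512-byte sectors):

#include <stdio.h>

int main(void)
{
	/* bfq_default_max_budget = 16 * 1024 sectors, as defined above. */
	const int default_max_budget = 16 * 1024;

	printf("max budget: %d sectors (%d KiB)\n",
	       default_max_budget, default_max_budget / 2);
	printf("min budget: %d sectors\n", default_max_budget / 32);
	return 0;
}
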
  2163. +
  2164. +static void bfq_arm_slice_timer(struct bfq_data *bfqd)
  2165. +{
  2166. + struct bfq_queue *bfqq = bfqd->in_service_queue;
  2167. + struct bfq_io_cq *bic;
  2168. + unsigned long sl;
  2169. +
  2170. + BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list));
  2171. +
  2172. + /* Processes have exited, don't wait. */
  2173. + bic = bfqd->in_service_bic;
  2174. + if (bic == NULL || atomic_read(&bic->icq.ioc->active_ref) == 0)
  2175. + return;
  2176. +
  2177. + bfq_mark_bfqq_wait_request(bfqq);
  2178. +
  2179. + /*
  2180. + * We don't want to idle for seeks, but we do want to allow
  2181. + * fair distribution of slice time for a process doing back-to-back
  2182. + * seeks. So allow a little bit of time for it to submit a new rq.
  2183. + *
  2184. + * To prevent processes with (partly) seeky workloads from
  2185. + * being too ill-treated, grant them a small fraction of the
  2186. + * assigned budget before reducing the waiting time to
  2187. + * BFQ_MIN_TT. This happened to help reduce latency.
  2188. + */
  2189. + sl = bfqd->bfq_slice_idle;
  2190. + /*
  2191. + * Unless the queue is being weight-raised, grant only minimum idle
  2192. + * time if the queue either has been seeky for long enough or has
  2193. + * already proved to be constantly seeky.
  2194. + */
  2195. + if (bfq_sample_valid(bfqq->seek_samples) &&
  2196. + ((BFQQ_SEEKY(bfqq) && bfqq->entity.service >
  2197. + bfq_max_budget(bfqq->bfqd) / 8) ||
  2198. + bfq_bfqq_constantly_seeky(bfqq)) && bfqq->wr_coeff == 1)
  2199. + sl = min(sl, msecs_to_jiffies(BFQ_MIN_TT));
  2200. + else if (bfqq->wr_coeff > 1)
  2201. + sl = sl * 3;
  2202. + bfqd->last_idling_start = ktime_get();
  2203. + mod_timer(&bfqd->idle_slice_timer, jiffies + sl);
  2204. + bfq_log(bfqd, "arm idle: %u/%u ms",
  2205. + jiffies_to_msecs(sl), jiffies_to_msecs(bfqd->bfq_slice_idle));
  2206. +}
  2207. +
  2208. +/*
  2209. + * Set the maximum time for the in-service queue to consume its
  2210. + * budget. This prevents seeky processes from lowering the disk
  2211. + * throughput (always guaranteed with a time slice scheme as in CFQ).
  2212. + */
  2213. +static void bfq_set_budget_timeout(struct bfq_data *bfqd)
  2214. +{
  2215. + struct bfq_queue *bfqq = bfqd->in_service_queue;
  2216. + unsigned int timeout_coeff;
  2217. + if (bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time)
  2218. + timeout_coeff = 1;
  2219. + else
  2220. + timeout_coeff = bfqq->entity.weight / bfqq->entity.orig_weight;
  2221. +
  2222. + bfqd->last_budget_start = ktime_get();
  2223. +
  2224. + bfq_clear_bfqq_budget_new(bfqq);
  2225. + bfqq->budget_timeout = jiffies +
  2226. + bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] * timeout_coeff;
  2227. +
  2228. + bfq_log_bfqq(bfqd, bfqq, "set budget_timeout %u",
  2229. + jiffies_to_msecs(bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] *
  2230. + timeout_coeff));
  2231. +}
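
The timeout_coeff above stretches the budget timeout in proportion to how much the queue's weight has been raised (weight / orig_weight), except during soft real-time raising, where it stays 1. A worked example with invented values (the kernel HZ and the raising coefficient of 20 below are assumptions for illustration only):

#include <stdio.h>

int main(void)
{
	/* Hypothetical values: a kernel HZ of 250, so bfq_timeout_sync =
	 * HZ / 8 = 31 jiffies (~125 ms), and a queue whose weight has been
	 * raised by a made-up coefficient of 20. */
	const unsigned int hz = 250;
	unsigned int timeout_sync = hz / 8;
	unsigned int orig_weight = 100, weight = 100 * 20;
	unsigned int timeout_coeff = weight / orig_weight;

	printf("budget timeout: %u jiffies (~%u ms)\n",
	       timeout_sync * timeout_coeff,
	       timeout_sync * timeout_coeff * 1000 / hz);
	return 0;
}
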
  2232. +
  2233. +/*
  2234. + * Move request from internal lists to the request queue dispatch list.
  2235. + */
  2236. +static void bfq_dispatch_insert(struct request_queue *q, struct request *rq)
  2237. +{
  2238. + struct bfq_data *bfqd = q->elevator->elevator_data;
  2239. + struct bfq_queue *bfqq = RQ_BFQQ(rq);
  2240. +
  2241. + /*
  2242. + * For consistency, the next instruction should have been executed
  2243. + * after removing the request from the queue and dispatching it.
  2244. + * We instead execute this instruction before bfq_remove_request()
  2245. + * (and hence introduce a temporary inconsistency), for efficiency.
  2246. + * In fact, in a forced dispatch, this prevents two counters related
  2247. + * to bfqq->dispatched from being uselessly decremented if bfqq
  2248. + * is not in service, and then incremented again after
  2249. + * incrementing bfqq->dispatched.
  2250. + */
  2251. + bfqq->dispatched++;
  2252. + bfq_remove_request(rq);
  2253. + elv_dispatch_sort(q, rq);
  2254. +
  2255. + if (bfq_bfqq_sync(bfqq))
  2256. + bfqd->sync_flight++;
  2257. +}
  2258. +
  2259. +/*
  2260. + * Return expired entry, or NULL to just start from scratch in rbtree.
  2261. + */
  2262. +static struct request *bfq_check_fifo(struct bfq_queue *bfqq)
  2263. +{
  2264. + struct request *rq = NULL;
  2265. +
  2266. + if (bfq_bfqq_fifo_expire(bfqq))
  2267. + return NULL;
  2268. +
  2269. + bfq_mark_bfqq_fifo_expire(bfqq);
  2270. +
  2271. + if (list_empty(&bfqq->fifo))
  2272. + return NULL;
  2273. +
  2274. + rq = rq_entry_fifo(bfqq->fifo.next);
  2275. +
  2276. + if (time_before(jiffies, rq->fifo_time))
  2277. + return NULL;
  2278. +
  2279. + return rq;
  2280. +}
  2281. +
  2282. +/*
  2283. + * Must be called with the queue_lock held.
  2284. + */
  2285. +static int bfqq_process_refs(struct bfq_queue *bfqq)
  2286. +{
  2287. + int process_refs, io_refs;
  2288. +
  2289. + io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE];
  2290. + process_refs = atomic_read(&bfqq->ref) - io_refs - bfqq->entity.on_st;
  2291. + BUG_ON(process_refs < 0);
  2292. + return process_refs;
  2293. +}
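
In words: process references are whatever is left of the queue's reference count after subtracting the references held by allocated requests and by the service tree. A tiny worked example with invented counts:

#include <stdio.h>

int main(void)
{
	/* Invented counts for one bfq_queue: total refcount 5, two READ and
	 * one WRITE request currently allocated against it, and the queue
	 * sitting on a service tree (on_st == 1). */
	int ref = 5;
	int allocated_read = 2, allocated_write = 1;
	int on_st = 1;

	int io_refs = allocated_read + allocated_write;
	int process_refs = ref - io_refs - on_st;

	printf("process refs: %d\n", process_refs);	/* 1 */
	return 0;
}
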
  2294. +
  2295. +static void bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
  2296. +{
  2297. + int process_refs, new_process_refs;
  2298. + struct bfq_queue *__bfqq;
  2299. +
  2300. + /*
  2301. + * If there are no process references on the new_bfqq, then it is
  2302. + * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain
  2303. + * may have dropped their last reference (not just their last process
  2304. + * reference).
  2305. + */
  2306. + if (!bfqq_process_refs(new_bfqq))
  2307. + return;
  2308. +
  2309. + /* Avoid a circular list and skip interim queue merges. */
  2310. + while ((__bfqq = new_bfqq->new_bfqq)) {
  2311. + if (__bfqq == bfqq)
  2312. + return;
  2313. + new_bfqq = __bfqq;
  2314. + }
  2315. +
  2316. + process_refs = bfqq_process_refs(bfqq);
  2317. + new_process_refs = bfqq_process_refs(new_bfqq);
  2318. + /*
  2319. + * If the process for the bfqq has gone away, there is no
  2320. + * sense in merging the queues.
  2321. + */
  2322. + if (process_refs == 0 || new_process_refs == 0)
  2323. + return;
  2324. +
  2325. + /*
  2326. + * Merge in the direction of the lesser amount of work.
  2327. + */
  2328. + if (new_process_refs >= process_refs) {
  2329. + bfqq->new_bfqq = new_bfqq;
  2330. + atomic_add(process_refs, &new_bfqq->ref);
  2331. + } else {
  2332. + new_bfqq->new_bfqq = bfqq;
  2333. + atomic_add(new_process_refs, &bfqq->ref);
  2334. + }
  2335. + bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d",
  2336. + new_bfqq->pid);
  2337. +}
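
The merge direction is chosen so that the smaller set of process references is redirected into the larger one ("the lesser amount of work"), and the target queue gains exactly that many references. A minimal sketch of just that decision, with hypothetical counts:

#include <stdio.h>

int main(void)
{
	/* Hypothetical process-reference counts for the two queues. */
	int process_refs = 2;		/* bfqq */
	int new_process_refs = 5;	/* new_bfqq */

	if (new_process_refs >= process_refs) {
		/* bfqq will be merged into new_bfqq: only 2 references need
		 * to be moved, and new_bfqq's count grows to 7. */
		printf("merge bfqq -> new_bfqq, new_bfqq refs += %d\n",
		       process_refs);
	} else {
		printf("merge new_bfqq -> bfqq, bfqq refs += %d\n",
		       new_process_refs);
	}
	return 0;
}
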
  2338. +
  2339. +static inline unsigned long bfq_bfqq_budget_left(struct bfq_queue *bfqq)
  2340. +{
  2341. + struct bfq_entity *entity = &bfqq->entity;
  2342. + return entity->budget - entity->service;
  2343. +}
  2344. +
  2345. +static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq)
  2346. +{
  2347. + BUG_ON(bfqq != bfqd->in_service_queue);
  2348. +
  2349. + __bfq_bfqd_reset_in_service(bfqd);
  2350. +
  2351. + /*
  2352. + * If this bfqq is shared between multiple processes, check
  2353. + * to make sure that those processes are still issuing I/Os
  2354. + * within the mean seek distance. If not, it may be time to
  2355. + * break the queues apart again.
  2356. + */
  2357. + if (bfq_bfqq_coop(bfqq) && BFQQ_SEEKY(bfqq))
  2358. + bfq_mark_bfqq_split_coop(bfqq);
  2359. +
  2360. + if (RB_EMPTY_ROOT(&bfqq->sort_list)) {
  2361. + /*
  2362. + * Overloading budget_timeout field to store the time
  2363. + * at which the queue remains with no backlog; used by
  2364. + * the weight-raising mechanism.
  2365. + */
  2366. + bfqq->budget_timeout = jiffies;
  2367. + bfq_del_bfqq_busy(bfqd, bfqq, 1);
  2368. + } else {
  2369. + bfq_activate_bfqq(bfqd, bfqq);
  2370. + /*
  2371. + * Resort priority tree of potential close cooperators.
  2372. + */
  2373. + bfq_rq_pos_tree_add(bfqd, bfqq);
  2374. + }
  2375. +}
  2376. +
  2377. +/**
  2378. + * __bfq_bfqq_recalc_budget - try to adapt the budget to the @bfqq behavior.
  2379. + * @bfqd: device data.
  2380. + * @bfqq: queue to update.
  2381. + * @reason: reason for expiration.
  2382. + *
  2383. + * Handle the feedback on @bfqq budget. See the body for detailed
  2384. + * comments.
  2385. + */
  2386. +static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd,
  2387. + struct bfq_queue *bfqq,
  2388. + enum bfqq_expiration reason)
  2389. +{
  2390. + struct request *next_rq;
  2391. + unsigned long budget, min_budget;
  2392. +
  2393. + budget = bfqq->max_budget;
  2394. + min_budget = bfq_min_budget(bfqd);
  2395. +
  2396. + BUG_ON(bfqq != bfqd->in_service_queue);
  2397. +
  2398. + bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last budg %lu, budg left %lu",
  2399. + bfqq->entity.budget, bfq_bfqq_budget_left(bfqq));
  2400. + bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last max_budg %lu, min budg %lu",
  2401. + budget, bfq_min_budget(bfqd));
  2402. + bfq_log_bfqq(bfqd, bfqq, "recalc_budg: sync %d, seeky %d",
  2403. + bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->in_service_queue));
  2404. +
  2405. + if (bfq_bfqq_sync(bfqq)) {
  2406. + switch (reason) {
  2407. + /*
  2408. + * Caveat: in all the following cases we trade latency
  2409. + * for throughput.
  2410. + */
  2411. + case BFQ_BFQQ_TOO_IDLE:
  2412. + /*
  2413. + * This is the only case where we may reduce
  2414. + * the budget: if there is no request of the
  2415. + * process still waiting for completion, then
  2416. + * we assume (tentatively) that the timer has
  2417. + * expired because the batch of requests of
  2418. + * the process could have been served with a
2419. + * smaller budget. Hence, betting that the
  2420. + * process will behave in the same way when it
  2421. + * becomes backlogged again, we reduce its
  2422. + * next budget. As long as we guess right,
  2423. + * this budget cut reduces the latency
  2424. + * experienced by the process.
  2425. + *
  2426. + * However, if there are still outstanding
  2427. + * requests, then the process may have not yet
  2428. + * issued its next request just because it is
  2429. + * still waiting for the completion of some of
  2430. + * the still outstanding ones. So in this
  2431. + * subcase we do not reduce its budget, on the
  2432. + * contrary we increase it to possibly boost
  2433. + * the throughput, as discussed in the
  2434. + * comments to the BUDGET_TIMEOUT case.
  2435. + */
  2436. + if (bfqq->dispatched > 0) /* still outstanding reqs */
  2437. + budget = min(budget * 2, bfqd->bfq_max_budget);
  2438. + else {
  2439. + if (budget > 5 * min_budget)
  2440. + budget -= 4 * min_budget;
  2441. + else
  2442. + budget = min_budget;
  2443. + }
  2444. + break;
  2445. + case BFQ_BFQQ_BUDGET_TIMEOUT:
  2446. + /*
  2447. + * We double the budget here because: 1) it
  2448. + * gives the chance to boost the throughput if
  2449. + * this is not a seeky process (which may have
  2450. + * bumped into this timeout because of, e.g.,
  2451. + * ZBR), 2) together with charge_full_budget
  2452. + * it helps give seeky processes higher
2453. + * timestamps, and hence to be served less
  2454. + * frequently.
  2455. + */
  2456. + budget = min(budget * 2, bfqd->bfq_max_budget);
  2457. + break;
  2458. + case BFQ_BFQQ_BUDGET_EXHAUSTED:
  2459. + /*
  2460. + * The process still has backlog, and did not
  2461. + * let either the budget timeout or the disk
  2462. + * idling timeout expire. Hence it is not
  2463. + * seeky, has a short thinktime and may be
  2464. + * happy with a higher budget too. So
  2465. + * definitely increase the budget of this good
  2466. + * candidate to boost the disk throughput.
  2467. + */
  2468. + budget = min(budget * 4, bfqd->bfq_max_budget);
  2469. + break;
  2470. + case BFQ_BFQQ_NO_MORE_REQUESTS:
  2471. + /*
  2472. + * Leave the budget unchanged.
  2473. + */
  2474. + default:
  2475. + return;
  2476. + }
  2477. + } else /* async queue */
2478. + /* async queues always get the maximum possible budget
  2479. + * (their ability to dispatch is limited by
  2480. + * @bfqd->bfq_max_budget_async_rq).
  2481. + */
  2482. + budget = bfqd->bfq_max_budget;
  2483. +
  2484. + bfqq->max_budget = budget;
  2485. +
  2486. + if (bfqd->budgets_assigned >= 194 && bfqd->bfq_user_max_budget == 0 &&
  2487. + bfqq->max_budget > bfqd->bfq_max_budget)
  2488. + bfqq->max_budget = bfqd->bfq_max_budget;
  2489. +
  2490. + /*
  2491. + * Make sure that we have enough budget for the next request.
  2492. + * Since the finish time of the bfqq must be kept in sync with
  2493. + * the budget, be sure to call __bfq_bfqq_expire() after the
  2494. + * update.
  2495. + */
  2496. + next_rq = bfqq->next_rq;
  2497. + if (next_rq != NULL)
  2498. + bfqq->entity.budget = max_t(unsigned long, bfqq->max_budget,
  2499. + bfq_serv_to_charge(next_rq, bfqq));
  2500. + else
  2501. + bfqq->entity.budget = bfqq->max_budget;
  2502. +
  2503. + bfq_log_bfqq(bfqd, bfqq, "head sect: %u, new budget %lu",
  2504. + next_rq != NULL ? blk_rq_sectors(next_rq) : 0,
  2505. + bfqq->entity.budget);
  2506. +}
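For illustration, the feedback rules above can be condensed into a small standalone C sketch (plain integers and illustrative names, not the kernel types): shrink the budget when the queue went idle with no requests in flight, double it when it went idle with requests still outstanding or on budget timeout, quadruple it on budget exhaustion, and leave it unchanged otherwise.

#include <stdio.h>

/*
 * Illustrative sketch, not part of the patch: the budget feedback rules
 * for a sync queue, written as a standalone userspace function.
 */
enum reason { TOO_IDLE, BUDGET_TIMEOUT, BUDGET_EXHAUSTED, NO_MORE_REQUESTS };

static unsigned long min_ul(unsigned long a, unsigned long b)
{
	return a < b ? a : b;
}

static unsigned long next_budget(unsigned long budget, unsigned long min_budget,
				 unsigned long max_budget, int dispatched,
				 enum reason reason)
{
	switch (reason) {
	case TOO_IDLE:
		if (dispatched > 0)		/* still outstanding reqs */
			return min_ul(budget * 2, max_budget);
		if (budget > 5 * min_budget)	/* shrink, but not below min */
			return budget - 4 * min_budget;
		return min_budget;
	case BUDGET_TIMEOUT:
		return min_ul(budget * 2, max_budget);
	case BUDGET_EXHAUSTED:
		return min_ul(budget * 4, max_budget);
	default:			/* NO_MORE_REQUESTS: unchanged */
		return budget;
	}
}

int main(void)
{
	/* e.g. budget 8192 sectors, min 512, max 16384 */
	printf("%lu\n", next_budget(8192, 512, 16384, 0, TOO_IDLE));		/* 6144 */
	printf("%lu\n", next_budget(8192, 512, 16384, 0, BUDGET_EXHAUSTED));	/* 16384 */
	return 0;
}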
  2507. +
  2508. +static unsigned long bfq_calc_max_budget(u64 peak_rate, u64 timeout)
  2509. +{
  2510. + unsigned long max_budget;
  2511. +
  2512. + /*
  2513. + * The max_budget calculated when autotuning is equal to the
2514. + * number of sectors transferred in timeout_sync at the
  2515. + * estimated peak rate.
  2516. + */
  2517. + max_budget = (unsigned long)(peak_rate * 1000 *
  2518. + timeout >> BFQ_RATE_SHIFT);
  2519. +
  2520. + return max_budget;
  2521. +}
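To get a feel for the units: the peak rate is stored as (sectors/usec) << BFQ_RATE_SHIFT and the timeout is in milliseconds, so multiplying by 1000 and shifting back yields a budget in sectors. A hedged standalone sketch, assuming a shift of 16 and a device sustaining roughly 100 MB/s:

#include <stdio.h>
#include <stdint.h>

/*
 * Illustrative sketch, not part of the patch. The shift value below is an
 * assumption standing in for BFQ_RATE_SHIFT; peak_rate is in
 * (sectors/usec) << RATE_SHIFT and timeout_ms in milliseconds, so the
 * result is a number of sectors.
 */
#define RATE_SHIFT 16

static unsigned long calc_max_budget(uint64_t peak_rate, uint64_t timeout_ms)
{
	return (unsigned long)((peak_rate * 1000 * timeout_ms) >> RATE_SHIFT);
}

int main(void)
{
	/* ~200 sectors/ms (~100 MB/s) and a 125 ms sync timeout */
	uint64_t peak_rate = (uint64_t)(0.2 * (1 << RATE_SHIFT));

	/* ~25000 sectors, i.e. ~12.5 MB worth of service per budget */
	printf("max_budget = %lu sectors\n", calc_max_budget(peak_rate, 125));
	return 0;
}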
  2522. +
  2523. +/*
  2524. + * In addition to updating the peak rate, checks whether the process
  2525. + * is "slow", and returns 1 if so. This slow flag is used, in addition
  2526. + * to the budget timeout, to reduce the amount of service provided to
2527. + * seeky processes, and hence reduce their chances of lowering the
  2528. + * throughput. See the code for more details.
  2529. + */
  2530. +static int bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq,
  2531. + int compensate, enum bfqq_expiration reason)
  2532. +{
  2533. + u64 bw, usecs, expected, timeout;
  2534. + ktime_t delta;
  2535. + int update = 0;
  2536. +
  2537. + if (!bfq_bfqq_sync(bfqq) || bfq_bfqq_budget_new(bfqq))
  2538. + return 0;
  2539. +
  2540. + if (compensate)
  2541. + delta = bfqd->last_idling_start;
  2542. + else
  2543. + delta = ktime_get();
  2544. + delta = ktime_sub(delta, bfqd->last_budget_start);
  2545. + usecs = ktime_to_us(delta);
  2546. +
  2547. + /* Don't trust short/unrealistic values. */
  2548. + if (usecs < 100 || usecs >= LONG_MAX)
  2549. + return 0;
  2550. +
  2551. + /*
  2552. + * Calculate the bandwidth for the last slice. We use a 64 bit
  2553. + * value to store the peak rate, in sectors per usec in fixed
  2554. + * point math. We do so to have enough precision in the estimate
  2555. + * and to avoid overflows.
  2556. + */
  2557. + bw = (u64)bfqq->entity.service << BFQ_RATE_SHIFT;
  2558. + do_div(bw, (unsigned long)usecs);
  2559. +
  2560. + timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]);
  2561. +
  2562. + /*
  2563. + * Use only long (> 20ms) intervals to filter out spikes for
  2564. + * the peak rate estimation.
  2565. + */
  2566. + if (usecs > 20000) {
  2567. + if (bw > bfqd->peak_rate ||
  2568. + (!BFQQ_SEEKY(bfqq) &&
  2569. + reason == BFQ_BFQQ_BUDGET_TIMEOUT)) {
  2570. + bfq_log(bfqd, "measured bw =%llu", bw);
  2571. + /*
  2572. + * To smooth oscillations use a low-pass filter with
  2573. + * alpha=7/8, i.e.,
  2574. + * new_rate = (7/8) * old_rate + (1/8) * bw
  2575. + */
  2576. + do_div(bw, 8);
  2577. + if (bw == 0)
  2578. + return 0;
  2579. + bfqd->peak_rate *= 7;
  2580. + do_div(bfqd->peak_rate, 8);
  2581. + bfqd->peak_rate += bw;
  2582. + update = 1;
  2583. + bfq_log(bfqd, "new peak_rate=%llu", bfqd->peak_rate);
  2584. + }
  2585. +
  2586. + update |= bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES - 1;
  2587. +
  2588. + if (bfqd->peak_rate_samples < BFQ_PEAK_RATE_SAMPLES)
  2589. + bfqd->peak_rate_samples++;
  2590. +
  2591. + if (bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES &&
  2592. + update) {
  2593. + int dev_type = blk_queue_nonrot(bfqd->queue);
  2594. + if (bfqd->bfq_user_max_budget == 0) {
  2595. + bfqd->bfq_max_budget =
  2596. + bfq_calc_max_budget(bfqd->peak_rate,
  2597. + timeout);
  2598. + bfq_log(bfqd, "new max_budget=%lu",
  2599. + bfqd->bfq_max_budget);
  2600. + }
  2601. + if (bfqd->device_speed == BFQ_BFQD_FAST &&
  2602. + bfqd->peak_rate < device_speed_thresh[dev_type]) {
  2603. + bfqd->device_speed = BFQ_BFQD_SLOW;
  2604. + bfqd->RT_prod = R_slow[dev_type] *
  2605. + T_slow[dev_type];
  2606. + } else if (bfqd->device_speed == BFQ_BFQD_SLOW &&
  2607. + bfqd->peak_rate > device_speed_thresh[dev_type]) {
  2608. + bfqd->device_speed = BFQ_BFQD_FAST;
  2609. + bfqd->RT_prod = R_fast[dev_type] *
  2610. + T_fast[dev_type];
  2611. + }
  2612. + }
  2613. + }
  2614. +
  2615. + /*
2616. + * If the process has been served for too short a time
2617. + * interval to let its possible sequential accesses prevail over
2618. + * the initial seek time needed to move the disk head to the
2619. + * first sector it requested, then give the process a chance
2620. + * and for the moment return false.
  2621. + */
  2622. + if (bfqq->entity.budget <= bfq_max_budget(bfqd) / 8)
  2623. + return 0;
  2624. +
  2625. + /*
  2626. + * A process is considered ``slow'' (i.e., seeky, so that we
  2627. + * cannot treat it fairly in the service domain, as it would
2628. + * slow down the other processes too much) if, when a slice
  2629. + * ends for whatever reason, it has received service at a
  2630. + * rate that would not be high enough to complete the budget
  2631. + * before the budget timeout expiration.
  2632. + */
  2633. + expected = bw * 1000 * timeout >> BFQ_RATE_SHIFT;
  2634. +
  2635. + /*
  2636. + * Caveat: processes doing IO in the slower disk zones will
  2637. + * tend to be slow(er) even if not seeky. And the estimated
  2638. + * peak rate will actually be an average over the disk
  2639. + * surface. Hence, to not be too harsh with unlucky processes,
  2640. + * we keep a budget/3 margin of safety before declaring a
  2641. + * process slow.
  2642. + */
  2643. + return expected > (4 * bfqq->entity.budget) / 3;
  2644. +}
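The alpha = 7/8 low-pass filter used above can be looked at in isolation with the following standalone sketch (userspace code with illustrative names; the kernel version uses do_div() and additionally gates the update on the 20 ms threshold and on the budget-timeout/seekiness checks):

#include <stdio.h>
#include <stdint.h>

/*
 * Illustrative sketch, not part of the patch: the alpha = 7/8 low-pass
 * filter, new_rate = 7/8 * old_rate + 1/8 * sample, in pure integer math.
 */
static uint64_t lowpass_update(uint64_t old_rate, uint64_t sample)
{
	sample /= 8;
	if (sample == 0)		/* too small to contribute */
		return old_rate;
	return old_rate * 7 / 8 + sample;
}

int main(void)
{
	uint64_t rate = 1000;

	/* a single outlier at 5000 moves the estimate by only 1/8 of the gap */
	rate = lowpass_update(rate, 5000);
	printf("rate after outlier: %llu\n", (unsigned long long)rate); /* 1500 */
	return 0;
}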
  2645. +
  2646. +/*
  2647. + * To be deemed as soft real-time, an application must meet two
  2648. + * requirements. First, the application must not require an average
2649. + * bandwidth higher than the approximate bandwidth required to play back or
  2650. + * record a compressed high-definition video.
  2651. + * The next function is invoked on the completion of the last request of a
  2652. + * batch, to compute the next-start time instant, soft_rt_next_start, such
  2653. + * that, if the next request of the application does not arrive before
  2654. + * soft_rt_next_start, then the above requirement on the bandwidth is met.
  2655. + *
  2656. + * The second requirement is that the request pattern of the application is
  2657. + * isochronous, i.e., that, after issuing a request or a batch of requests,
  2658. + * the application stops issuing new requests until all its pending requests
  2659. + * have been completed. After that, the application may issue a new batch,
  2660. + * and so on.
  2661. + * For this reason the next function is invoked to compute
  2662. + * soft_rt_next_start only for applications that meet this requirement,
  2663. + * whereas soft_rt_next_start is set to infinity for applications that do
  2664. + * not.
  2665. + *
  2666. + * Unfortunately, even a greedy application may happen to behave in an
  2667. + * isochronous way if the CPU load is high. In fact, the application may
  2668. + * stop issuing requests while the CPUs are busy serving other processes,
  2669. + * then restart, then stop again for a while, and so on. In addition, if
  2670. + * the disk achieves a low enough throughput with the request pattern
  2671. + * issued by the application (e.g., because the request pattern is random
  2672. + * and/or the device is slow), then the application may meet the above
2673. + * bandwidth requirement too. To prevent such a greedy application from being
2674. + * deemed as soft real-time, a further rule is used in the computation of
2675. + * soft_rt_next_start: soft_rt_next_start must be higher than the current
2676. + * time plus the maximum time for which we wait for the arrival of a new
2677. + * request when a sync queue becomes idle, namely bfqd->bfq_slice_idle.
  2678. + * This filters out greedy applications, as the latter issue instead their
  2679. + * next request as soon as possible after the last one has been completed
  2680. + * (in contrast, when a batch of requests is completed, a soft real-time
  2681. + * application spends some time processing data).
  2682. + *
  2683. + * Unfortunately, the last filter may easily generate false positives if
  2684. + * only bfqd->bfq_slice_idle is used as a reference time interval and one
  2685. + * or both the following cases occur:
  2686. + * 1) HZ is so low that the duration of a jiffy is comparable to or higher
  2687. + * than bfqd->bfq_slice_idle. This happens, e.g., on slow devices with
  2688. + * HZ=100.
  2689. + * 2) jiffies, instead of increasing at a constant rate, may stop increasing
  2690. + * for a while, then suddenly 'jump' by several units to recover the lost
  2691. + * increments. This seems to happen, e.g., inside virtual machines.
  2692. + * To address this issue, we do not use as a reference time interval just
  2693. + * bfqd->bfq_slice_idle, but bfqd->bfq_slice_idle plus a few jiffies. In
  2694. + * particular we add the minimum number of jiffies for which the filter
  2695. + * seems to be quite precise also in embedded systems and KVM/QEMU virtual
  2696. + * machines.
  2697. + */
  2698. +static inline unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd,
  2699. + struct bfq_queue *bfqq)
  2700. +{
  2701. + return max(bfqq->last_idle_bklogged +
  2702. + HZ * bfqq->service_from_backlogged /
  2703. + bfqd->bfq_wr_max_softrt_rate,
  2704. + jiffies + bfqq->bfqd->bfq_slice_idle + 4);
  2705. +}
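A hedged reading of the formula above, with the bfqd/bfqq fields replaced by plain parameters: serving service_from_backlogged sectors at the maximum allowed soft real-time rate pushes the next eligible instant forward proportionally, and the result is never earlier than now + slice_idle + 4 jiffies, which is the greedy-application filter just discussed.

#include <stdio.h>

/*
 * Illustrative sketch, not part of the patch: plain-parameter version of
 * the computation above. Times are in jiffies, service in sectors,
 * max_rate in sectors/sec.
 */
static unsigned long softrt_next_start(unsigned long last_idle_bklogged,
				       unsigned long service_from_backlogged,
				       unsigned long max_rate,
				       unsigned long now,
				       unsigned long slice_idle,
				       unsigned long hz)
{
	unsigned long bw_based = last_idle_bklogged +
		hz * service_from_backlogged / max_rate;
	unsigned long greedy_filter = now + slice_idle + 4;

	return bw_based > greedy_filter ? bw_based : greedy_filter;
}

int main(void)
{
	/*
	 * HZ=250, 7000 sectors served since jiffy 1000, allowed soft-rt rate
	 * 7000 sectors/s, now=1200, slice_idle=2 jiffies:
	 * result is max(1000 + 250, 1200 + 2 + 4) = 1250.
	 */
	printf("%lu\n", softrt_next_start(1000, 7000, 7000, 1200, 2, 250));
	return 0;
}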
  2706. +
  2707. +/*
  2708. + * Return the largest-possible time instant such that, for as long as possible,
  2709. + * the current time will be lower than this time instant according to the macro
  2710. + * time_is_before_jiffies().
  2711. + */
  2712. +static inline unsigned long bfq_infinity_from_now(unsigned long now)
  2713. +{
  2714. + return now + ULONG_MAX / 2;
  2715. +}
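The ULONG_MAX/2 offset is tied to how the jiffies comparison macros work: they compare through wrap-around signed subtraction, so a value half the counter range ahead of now stays 'in the future' for as long as arithmetically possible. A minimal userspace sketch of that behavior (the comparison is re-implemented here only for illustration):

#include <stdio.h>

/*
 * Illustrative sketch, not part of the patch: roughly how the kernel's
 * time_before() behaves, i.e. wrap-around signed subtraction.
 */
static int before(unsigned long a, unsigned long b)
{
	return (long)(a - b) < 0;
}

int main(void)
{
	unsigned long now = 123456;
	unsigned long inf = now + (~0UL) / 2;

	/* "inf" stays in the future for ULONG_MAX/2 ticks after now */
	printf("%d %d\n", before(now, inf), before(now + (~0UL) / 4, inf));
	return 0;
}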
  2716. +
  2717. +/**
  2718. + * bfq_bfqq_expire - expire a queue.
  2719. + * @bfqd: device owning the queue.
  2720. + * @bfqq: the queue to expire.
  2721. + * @compensate: if true, compensate for the time spent idling.
  2722. + * @reason: the reason causing the expiration.
  2723. + *
  2724. + *
2725. + * If the process associated with the queue is slow (i.e., seeky), or in
  2726. + * case of budget timeout, or, finally, if it is async, we
  2727. + * artificially charge it an entire budget (independently of the
  2728. + * actual service it received). As a consequence, the queue will get
  2729. + * higher timestamps than the correct ones upon reactivation, and
  2730. + * hence it will be rescheduled as if it had received more service
  2731. + * than what it actually received. In the end, this class of processes
  2732. + * will receive less service in proportion to how slowly they consume
  2733. + * their budgets (and hence how seriously they tend to lower the
  2734. + * throughput).
  2735. + *
  2736. + * In contrast, when a queue expires because it has been idling for
2737. + * too long or because it exhausted its budget, we do not touch the
  2738. + * amount of service it has received. Hence when the queue will be
  2739. + * reactivated and its timestamps updated, the latter will be in sync
  2740. + * with the actual service received by the queue until expiration.
  2741. + *
  2742. + * Charging a full budget to the first type of queues and the exact
  2743. + * service to the others has the effect of using the WF2Q+ policy to
  2744. + * schedule the former on a timeslice basis, without violating the
  2745. + * service domain guarantees of the latter.
  2746. + */
  2747. +static void bfq_bfqq_expire(struct bfq_data *bfqd,
  2748. + struct bfq_queue *bfqq,
  2749. + int compensate,
  2750. + enum bfqq_expiration reason)
  2751. +{
  2752. + int slow;
  2753. + BUG_ON(bfqq != bfqd->in_service_queue);
  2754. +
  2755. + /* Update disk peak rate for autotuning and check whether the
  2756. + * process is slow (see bfq_update_peak_rate).
  2757. + */
  2758. + slow = bfq_update_peak_rate(bfqd, bfqq, compensate, reason);
  2759. +
  2760. + /*
2761. + * As explained above, 'punish' slow (i.e., seeky), timed-out
  2762. + * and async queues, to favor sequential sync workloads.
  2763. + *
  2764. + * Processes doing I/O in the slower disk zones will tend to be
  2765. + * slow(er) even if not seeky. Hence, since the estimated peak
  2766. + * rate is actually an average over the disk surface, these
  2767. + * processes may timeout just for bad luck. To avoid punishing
  2768. + * them we do not charge a full budget to a process that
  2769. + * succeeded in consuming at least 2/3 of its budget.
  2770. + */
  2771. + if (slow || (reason == BFQ_BFQQ_BUDGET_TIMEOUT &&
  2772. + bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3))
  2773. + bfq_bfqq_charge_full_budget(bfqq);
  2774. +
  2775. + bfqq->service_from_backlogged += bfqq->entity.service;
  2776. +
  2777. + if (BFQQ_SEEKY(bfqq) && reason == BFQ_BFQQ_BUDGET_TIMEOUT &&
  2778. + !bfq_bfqq_constantly_seeky(bfqq)) {
  2779. + bfq_mark_bfqq_constantly_seeky(bfqq);
  2780. + if (!blk_queue_nonrot(bfqd->queue))
  2781. + bfqd->const_seeky_busy_in_flight_queues++;
  2782. + }
  2783. +
  2784. + if (reason == BFQ_BFQQ_TOO_IDLE &&
2785. + bfqq->entity.service <= 2 * bfqq->entity.budget / 10)
  2786. + bfq_clear_bfqq_IO_bound(bfqq);
  2787. +
  2788. + if (bfqd->low_latency && bfqq->wr_coeff == 1)
  2789. + bfqq->last_wr_start_finish = jiffies;
  2790. +
  2791. + if (bfqd->low_latency && bfqd->bfq_wr_max_softrt_rate > 0 &&
  2792. + RB_EMPTY_ROOT(&bfqq->sort_list)) {
  2793. + /*
  2794. + * If we get here, and there are no outstanding requests,
  2795. + * then the request pattern is isochronous (see the comments
  2796. + * to the function bfq_bfqq_softrt_next_start()). Hence we
  2797. + * can compute soft_rt_next_start. If, instead, the queue
  2798. + * still has outstanding requests, then we have to wait
  2799. + * for the completion of all the outstanding requests to
  2800. + * discover whether the request pattern is actually
  2801. + * isochronous.
  2802. + */
  2803. + if (bfqq->dispatched == 0)
  2804. + bfqq->soft_rt_next_start =
  2805. + bfq_bfqq_softrt_next_start(bfqd, bfqq);
  2806. + else {
  2807. + /*
  2808. + * The application is still waiting for the
  2809. + * completion of one or more requests:
  2810. + * prevent it from possibly being incorrectly
  2811. + * deemed as soft real-time by setting its
  2812. + * soft_rt_next_start to infinity. In fact,
  2813. + * without this assignment, the application
  2814. + * would be incorrectly deemed as soft
  2815. + * real-time if:
  2816. + * 1) it issued a new request before the
  2817. + * completion of all its in-flight
  2818. + * requests, and
  2819. + * 2) at that time, its soft_rt_next_start
  2820. + * happened to be in the past.
  2821. + */
  2822. + bfqq->soft_rt_next_start =
  2823. + bfq_infinity_from_now(jiffies);
  2824. + /*
  2825. + * Schedule an update of soft_rt_next_start to when
  2826. + * the task may be discovered to be isochronous.
  2827. + */
  2828. + bfq_mark_bfqq_softrt_update(bfqq);
  2829. + }
  2830. + }
  2831. +
  2832. + bfq_log_bfqq(bfqd, bfqq,
  2833. + "expire (%d, slow %d, num_disp %d, idle_win %d)", reason,
  2834. + slow, bfqq->dispatched, bfq_bfqq_idle_window(bfqq));
  2835. +
  2836. + /*
  2837. + * Increase, decrease or leave budget unchanged according to
  2838. + * reason.
  2839. + */
  2840. + __bfq_bfqq_recalc_budget(bfqd, bfqq, reason);
  2841. + __bfq_bfqq_expire(bfqd, bfqq);
  2842. +}
  2843. +
  2844. +/*
  2845. + * Budget timeout is not implemented through a dedicated timer, but
  2846. + * just checked on request arrivals and completions, as well as on
  2847. + * idle timer expirations.
  2848. + */
  2849. +static int bfq_bfqq_budget_timeout(struct bfq_queue *bfqq)
  2850. +{
  2851. + if (bfq_bfqq_budget_new(bfqq) ||
  2852. + time_before(jiffies, bfqq->budget_timeout))
  2853. + return 0;
  2854. + return 1;
  2855. +}
  2856. +
  2857. +/*
  2858. + * If we expire a queue that is waiting for the arrival of a new
  2859. + * request, we may prevent the fictitious timestamp back-shifting that
  2860. + * allows the guarantees of the queue to be preserved (see [1] for
  2861. + * this tricky aspect). Hence we return true only if this condition
2862. + * does not hold, or if the queue is slow enough that it deserves only to be
2863. + * kicked off to preserve a high throughput.
2864. + */
  2865. +static inline int bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq)
  2866. +{
  2867. + bfq_log_bfqq(bfqq->bfqd, bfqq,
  2868. + "may_budget_timeout: wait_request %d left %d timeout %d",
  2869. + bfq_bfqq_wait_request(bfqq),
  2870. + bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3,
  2871. + bfq_bfqq_budget_timeout(bfqq));
  2872. +
  2873. + return (!bfq_bfqq_wait_request(bfqq) ||
  2874. + bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3)
  2875. + &&
  2876. + bfq_bfqq_budget_timeout(bfqq);
  2877. +}
  2878. +
  2879. +/*
  2880. + * Device idling is allowed only for the queues for which this function
  2881. + * returns true. For this reason, the return value of this function plays a
  2882. + * critical role for both throughput boosting and service guarantees. The
  2883. + * return value is computed through a logical expression. In this rather
  2884. + * long comment, we try to briefly describe all the details and motivations
  2885. + * behind the components of this logical expression.
  2886. + *
  2887. + * First, the expression may be true only for sync queues. Besides, if
  2888. + * bfqq is also being weight-raised, then the expression always evaluates
  2889. + * to true, as device idling is instrumental for preserving low-latency
  2890. + * guarantees (see [1]). Otherwise, the expression evaluates to true only
  2891. + * if bfqq has a non-null idle window and at least one of the following
  2892. + * two conditions holds. The first condition is that the device is not
  2893. + * performing NCQ, because idling the device most certainly boosts the
  2894. + * throughput if this condition holds and bfqq has been granted a non-null
  2895. + * idle window. The second compound condition is made of the logical AND of
  2896. + * two components.
  2897. + *
  2898. + * The first component is true only if there is no weight-raised busy
  2899. + * queue. This guarantees that the device is not idled for a sync non-
  2900. + * weight-raised queue when there are busy weight-raised queues. The former
  2901. + * is then expired immediately if empty. Combined with the timestamping
  2902. + * rules of BFQ (see [1] for details), this causes sync non-weight-raised
  2903. + * queues to get a lower number of requests served, and hence to ask for a
  2904. + * lower number of requests from the request pool, before the busy weight-
  2905. + * raised queues get served again.
  2906. + *
  2907. + * This is beneficial for the processes associated with weight-raised
  2908. + * queues, when the request pool is saturated (e.g., in the presence of
  2909. + * write hogs). In fact, if the processes associated with the other queues
  2910. + * ask for requests at a lower rate, then weight-raised processes have a
  2911. + * higher probability to get a request from the pool immediately (or at
  2912. + * least soon) when they need one. Hence they have a higher probability to
  2913. + * actually get a fraction of the disk throughput proportional to their
  2914. + * high weight. This is especially true with NCQ-capable drives, which
  2915. + * enqueue several requests in advance and further reorder internally-
  2916. + * queued requests.
  2917. + *
  2918. + * In the end, mistreating non-weight-raised queues when there are busy
  2919. + * weight-raised queues seems to mitigate starvation problems in the
  2920. + * presence of heavy write workloads and NCQ, and hence to guarantee a
  2921. + * higher application and system responsiveness in these hostile scenarios.
  2922. + *
  2923. + * If the first component of the compound condition is instead true, i.e.,
  2924. + * there is no weight-raised busy queue, then the second component of the
  2925. + * compound condition takes into account service-guarantee and throughput
  2926. + * issues related to NCQ (recall that the compound condition is evaluated
  2927. + * only if the device is detected as supporting NCQ).
  2928. + *
  2929. + * As for service guarantees, allowing the drive to enqueue more than one
  2930. + * request at a time, and hence delegating de facto final scheduling
  2931. + * decisions to the drive's internal scheduler, causes loss of control on
  2932. + * the actual request service order. In this respect, when the drive is
  2933. + * allowed to enqueue more than one request at a time, the service
  2934. + * distribution enforced by the drive's internal scheduler is likely to
  2935. + * coincide with the desired device-throughput distribution only in the
  2936. + * following, perfectly symmetric, scenario:
  2937. + * 1) all active queues have the same weight,
  2938. + * 2) all active groups at the same level in the groups tree have the same
  2939. + * weight,
  2940. + * 3) all active groups at the same level in the groups tree have the same
  2941. + * number of children.
  2942. + *
  2943. + * Even in such a scenario, sequential I/O may still receive a preferential
  2944. + * treatment, but this is not likely to be a big issue with flash-based
  2945. + * devices, because of their non-dramatic loss of throughput with random
  2946. + * I/O. Things do differ with HDDs, for which additional care is taken, as
  2947. + * explained after completing the discussion for flash-based devices.
  2948. + *
  2949. + * Unfortunately, keeping the necessary state for evaluating exactly the
  2950. + * above symmetry conditions would be quite complex and time-consuming.
  2951. + * Therefore BFQ evaluates instead the following stronger sub-conditions,
  2952. + * for which it is much easier to maintain the needed state:
  2953. + * 1) all active queues have the same weight,
  2954. + * 2) all active groups have the same weight,
  2955. + * 3) all active groups have at most one active child each.
  2956. + * In particular, the last two conditions are always true if hierarchical
  2957. + * support and the cgroups interface are not enabled, hence no state needs
  2958. + * to be maintained in this case.
  2959. + *
  2960. + * According to the above considerations, the second component of the
  2961. + * compound condition evaluates to true if any of the above symmetry
2962. + * sub-conditions does not hold, or the device is not flash-based. Therefore,
  2963. + * if also the first component is true, then idling is allowed for a sync
  2964. + * queue. These are the only sub-conditions considered if the device is
  2965. + * flash-based, as, for such a device, it is sensible to force idling only
  2966. + * for service-guarantee issues. In fact, as for throughput, idling
  2967. + * NCQ-capable flash-based devices would not boost the throughput even
  2968. + * with sequential I/O; rather it would lower the throughput in proportion
  2969. + * to how fast the device is. In the end, (only) if all the three
  2970. + * sub-conditions hold and the device is flash-based, the compound
  2971. + * condition evaluates to false and therefore no idling is performed.
  2972. + *
  2973. + * As already said, things change with a rotational device, where idling
  2974. + * boosts the throughput with sequential I/O (even with NCQ). Hence, for
  2975. + * such a device the second component of the compound condition evaluates
  2976. + * to true also if the following additional sub-condition does not hold:
  2977. + * the queue is constantly seeky. Unfortunately, this different behavior
  2978. + * with respect to flash-based devices causes an additional asymmetry: if
  2979. + * some sync queues enjoy idling and some other sync queues do not, then
  2980. + * the latter get a low share of the device throughput, simply because the
  2981. + * former get many requests served after being set as in service, whereas
  2982. + * the latter do not. As a consequence, to guarantee the desired throughput
  2983. + * distribution, on HDDs the compound expression evaluates to true (and
  2984. + * hence device idling is performed) also if the following last symmetry
  2985. + * condition does not hold: no other queue is benefiting from idling. Also
  2986. + * this last condition is actually replaced with a simpler-to-maintain and
  2987. + * stronger condition: there is no busy queue which is not constantly seeky
  2988. + * (and hence may also benefit from idling).
  2989. + *
  2990. + * To sum up, when all the required symmetry and throughput-boosting
  2991. + * sub-conditions hold, the second component of the compound condition
  2992. + * evaluates to false, and hence no idling is performed. This helps to
  2993. + * keep the drives' internal queues full on NCQ-capable devices, and hence
  2994. + * to boost the throughput, without causing 'almost' any loss of service
  2995. + * guarantees. The 'almost' follows from the fact that, if the internal
  2996. + * queue of one such device is filled while all the sub-conditions hold,
2997. + * but at some point in time some sub-condition ceases to hold, then it may
  2998. + * become impossible to let requests be served in the new desired order
  2999. + * until all the requests already queued in the device have been served.
  3000. + */
  3001. +static inline bool bfq_bfqq_must_not_expire(struct bfq_queue *bfqq)
  3002. +{
  3003. + struct bfq_data *bfqd = bfqq->bfqd;
  3004. +#ifdef CONFIG_CGROUP_BFQIO
  3005. +#define symmetric_scenario (!bfqd->active_numerous_groups && \
  3006. + !bfq_differentiated_weights(bfqd))
  3007. +#else
  3008. +#define symmetric_scenario (!bfq_differentiated_weights(bfqd))
  3009. +#endif
  3010. +#define cond_for_seeky_on_ncq_hdd (bfq_bfqq_constantly_seeky(bfqq) && \
  3011. + bfqd->busy_in_flight_queues == \
  3012. + bfqd->const_seeky_busy_in_flight_queues)
  3013. +/*
  3014. + * Condition for expiring a non-weight-raised queue (and hence not idling
  3015. + * the device).
  3016. + */
  3017. +#define cond_for_expiring_non_wr (bfqd->hw_tag && \
  3018. + (bfqd->wr_busy_queues > 0 || \
  3019. + (symmetric_scenario && \
  3020. + (blk_queue_nonrot(bfqd->queue) || \
  3021. + cond_for_seeky_on_ncq_hdd))))
  3022. +
  3023. + return bfq_bfqq_sync(bfqq) &&
  3024. + (bfq_bfqq_IO_bound(bfqq) || bfqq->wr_coeff > 1) &&
  3025. + (bfqq->wr_coeff > 1 ||
  3026. + (bfq_bfqq_idle_window(bfqq) &&
  3027. + !cond_for_expiring_non_wr)
  3028. + );
  3029. +}
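To make the shape of the return expression above easier to follow, here is a hedged restatement with plain booleans (the parameter names paraphrase the fields and macros used above and are not the kernel identifiers):

#include <stdbool.h>
#include <stdio.h>

/*
 * Illustrative sketch, not part of the patch: the return expression above
 * restated with plain booleans.
 */
static bool must_not_expire(bool sync, bool io_bound, bool weight_raised,
			    bool idle_window, bool ncq, bool wr_busy_queues,
			    bool symmetric, bool nonrot, bool all_busy_seeky)
{
	bool expire_non_wr = ncq &&
		(wr_busy_queues ||
		 (symmetric && (nonrot || all_busy_seeky)));

	return sync && (io_bound || weight_raised) &&
	       (weight_raised || (idle_window && !expire_non_wr));
}

int main(void)
{
	/*
	 * A sync, IO-bound, non-weight-raised queue on an NCQ flash device
	 * in a symmetric scenario: no idling, the queue may be expired.
	 */
	printf("%d\n", must_not_expire(true, true, false, true,
				       true, false, true, true, false));
	return 0;
}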
  3030. +
  3031. +/*
  3032. + * If the in-service queue is empty but sync, and the function
  3033. + * bfq_bfqq_must_not_expire returns true, then:
  3034. + * 1) the queue must remain in service and cannot be expired, and
  3035. + * 2) the disk must be idled to wait for the possible arrival of a new
  3036. + * request for the queue.
  3037. + * See the comments to the function bfq_bfqq_must_not_expire for the reasons
  3038. + * why performing device idling is the best choice to boost the throughput
  3039. + * and preserve service guarantees when bfq_bfqq_must_not_expire itself
  3040. + * returns true.
  3041. + */
  3042. +static inline bool bfq_bfqq_must_idle(struct bfq_queue *bfqq)
  3043. +{
  3044. + struct bfq_data *bfqd = bfqq->bfqd;
  3045. +
  3046. + return RB_EMPTY_ROOT(&bfqq->sort_list) && bfqd->bfq_slice_idle != 0 &&
  3047. + bfq_bfqq_must_not_expire(bfqq);
  3048. +}
  3049. +
  3050. +/*
  3051. + * Select a queue for service. If we have a current queue in service,
  3052. + * check whether to continue servicing it, or retrieve and set a new one.
  3053. + */
  3054. +static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
  3055. +{
  3056. + struct bfq_queue *bfqq, *new_bfqq = NULL;
  3057. + struct request *next_rq;
  3058. + enum bfqq_expiration reason = BFQ_BFQQ_BUDGET_TIMEOUT;
  3059. +
  3060. + bfqq = bfqd->in_service_queue;
  3061. + if (bfqq == NULL)
  3062. + goto new_queue;
  3063. +
  3064. + bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue");
  3065. +
  3066. + /*
  3067. + * If another queue has a request waiting within our mean seek
  3068. + * distance, let it run. The expire code will check for close
  3069. + * cooperators and put the close queue at the front of the
  3070. + * service tree. If possible, merge the expiring queue with the
  3071. + * new bfqq.
  3072. + */
  3073. + new_bfqq = bfq_close_cooperator(bfqd, bfqq);
  3074. + if (new_bfqq != NULL && bfqq->new_bfqq == NULL)
  3075. + bfq_setup_merge(bfqq, new_bfqq);
  3076. +
  3077. + if (bfq_may_expire_for_budg_timeout(bfqq) &&
  3078. + !timer_pending(&bfqd->idle_slice_timer) &&
  3079. + !bfq_bfqq_must_idle(bfqq))
  3080. + goto expire;
  3081. +
  3082. + next_rq = bfqq->next_rq;
  3083. + /*
  3084. + * If bfqq has requests queued and it has enough budget left to
  3085. + * serve them, keep the queue, otherwise expire it.
  3086. + */
  3087. + if (next_rq != NULL) {
  3088. + if (bfq_serv_to_charge(next_rq, bfqq) >
  3089. + bfq_bfqq_budget_left(bfqq)) {
  3090. + reason = BFQ_BFQQ_BUDGET_EXHAUSTED;
  3091. + goto expire;
  3092. + } else {
  3093. + /*
  3094. + * The idle timer may be pending because we may
  3095. + * not disable disk idling even when a new request
  3096. + * arrives.
  3097. + */
  3098. + if (timer_pending(&bfqd->idle_slice_timer)) {
  3099. + /*
  3100. + * If we get here: 1) at least a new request
  3101. + * has arrived but we have not disabled the
  3102. + * timer because the request was too small,
  3103. + * 2) then the block layer has unplugged
  3104. + * the device, causing the dispatch to be
  3105. + * invoked.
  3106. + *
  3107. + * Since the device is unplugged, now the
  3108. + * requests are probably large enough to
  3109. + * provide a reasonable throughput.
  3110. + * So we disable idling.
  3111. + */
  3112. + bfq_clear_bfqq_wait_request(bfqq);
  3113. + del_timer(&bfqd->idle_slice_timer);
  3114. + }
  3115. + if (new_bfqq == NULL)
  3116. + goto keep_queue;
  3117. + else
  3118. + goto expire;
  3119. + }
  3120. + }
  3121. +
  3122. + /*
  3123. + * No requests pending. If the in-service queue still has requests
  3124. + * in flight (possibly waiting for a completion) or is idling for a
  3125. + * new request, then keep it.
  3126. + */
  3127. + if (new_bfqq == NULL && (timer_pending(&bfqd->idle_slice_timer) ||
  3128. + (bfqq->dispatched != 0 && bfq_bfqq_must_not_expire(bfqq)))) {
  3129. + bfqq = NULL;
  3130. + goto keep_queue;
  3131. + } else if (new_bfqq != NULL && timer_pending(&bfqd->idle_slice_timer)) {
  3132. + /*
3133. + * Expiring the queue because there is a close cooperator;
3134. + * cancel the timer.
  3135. + */
  3136. + bfq_clear_bfqq_wait_request(bfqq);
  3137. + del_timer(&bfqd->idle_slice_timer);
  3138. + }
  3139. +
  3140. + reason = BFQ_BFQQ_NO_MORE_REQUESTS;
  3141. +expire:
  3142. + bfq_bfqq_expire(bfqd, bfqq, 0, reason);
  3143. +new_queue:
  3144. + bfqq = bfq_set_in_service_queue(bfqd, new_bfqq);
  3145. + bfq_log(bfqd, "select_queue: new queue %d returned",
  3146. + bfqq != NULL ? bfqq->pid : 0);
  3147. +keep_queue:
  3148. + return bfqq;
  3149. +}
  3150. +
  3151. +static void bfq_update_wr_data(struct bfq_data *bfqd,
  3152. + struct bfq_queue *bfqq)
  3153. +{
  3154. + if (bfqq->wr_coeff > 1) { /* queue is being boosted */
  3155. + struct bfq_entity *entity = &bfqq->entity;
  3156. +
  3157. + bfq_log_bfqq(bfqd, bfqq,
  3158. + "raising period dur %u/%u msec, old coeff %u, w %d(%d)",
  3159. + jiffies_to_msecs(jiffies -
  3160. + bfqq->last_wr_start_finish),
  3161. + jiffies_to_msecs(bfqq->wr_cur_max_time),
  3162. + bfqq->wr_coeff,
  3163. + bfqq->entity.weight, bfqq->entity.orig_weight);
  3164. +
  3165. + BUG_ON(bfqq != bfqd->in_service_queue && entity->weight !=
  3166. + entity->orig_weight * bfqq->wr_coeff);
  3167. + if (entity->ioprio_changed)
  3168. + bfq_log_bfqq(bfqd, bfqq, "WARN: pending prio change");
  3169. + /*
  3170. + * If too much time has elapsed from the beginning
  3171. + * of this weight-raising, stop it.
  3172. + */
  3173. + if (time_is_before_jiffies(bfqq->last_wr_start_finish +
  3174. + bfqq->wr_cur_max_time)) {
  3175. + bfqq->last_wr_start_finish = jiffies;
  3176. + bfq_log_bfqq(bfqd, bfqq,
  3177. + "wrais ending at %lu, rais_max_time %u",
  3178. + bfqq->last_wr_start_finish,
  3179. + jiffies_to_msecs(bfqq->wr_cur_max_time));
  3180. + bfq_bfqq_end_wr(bfqq);
  3181. + __bfq_entity_update_weight_prio(
  3182. + bfq_entity_service_tree(entity),
  3183. + entity);
  3184. + }
  3185. + }
  3186. +}
  3187. +
  3188. +/*
  3189. + * Dispatch one request from bfqq, moving it to the request queue
  3190. + * dispatch list.
  3191. + */
  3192. +static int bfq_dispatch_request(struct bfq_data *bfqd,
  3193. + struct bfq_queue *bfqq)
  3194. +{
  3195. + int dispatched = 0;
  3196. + struct request *rq;
  3197. + unsigned long service_to_charge;
  3198. +
  3199. + BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list));
  3200. +
  3201. + /* Follow expired path, else get first next available. */
  3202. + rq = bfq_check_fifo(bfqq);
  3203. + if (rq == NULL)
  3204. + rq = bfqq->next_rq;
  3205. + service_to_charge = bfq_serv_to_charge(rq, bfqq);
  3206. +
  3207. + if (service_to_charge > bfq_bfqq_budget_left(bfqq)) {
  3208. + /*
  3209. + * This may happen if the next rq is chosen in fifo order
  3210. + * instead of sector order. The budget is properly
  3211. + * dimensioned to be always sufficient to serve the next
  3212. + * request only if it is chosen in sector order. The reason
3213. + * is that it would be quite inefficient and of little use
  3214. + * to always make sure that the budget is large enough to
  3215. + * serve even the possible next rq in fifo order.
  3216. + * In fact, requests are seldom served in fifo order.
  3217. + *
  3218. + * Expire the queue for budget exhaustion, and make sure
  3219. + * that the next act_budget is enough to serve the next
  3220. + * request, even if it comes from the fifo expired path.
  3221. + */
  3222. + bfqq->next_rq = rq;
  3223. + /*
3224. + * Since this dispatch failed, make sure that
3225. + * a new one will be performed.
  3226. + */
  3227. + if (!bfqd->rq_in_driver)
  3228. + bfq_schedule_dispatch(bfqd);
  3229. + goto expire;
  3230. + }
  3231. +
  3232. + /* Finally, insert request into driver dispatch list. */
  3233. + bfq_bfqq_served(bfqq, service_to_charge);
  3234. + bfq_dispatch_insert(bfqd->queue, rq);
  3235. +
  3236. + bfq_update_wr_data(bfqd, bfqq);
  3237. +
  3238. + bfq_log_bfqq(bfqd, bfqq,
  3239. + "dispatched %u sec req (%llu), budg left %lu",
  3240. + blk_rq_sectors(rq),
  3241. + (long long unsigned)blk_rq_pos(rq),
  3242. + bfq_bfqq_budget_left(bfqq));
  3243. +
  3244. + dispatched++;
  3245. +
  3246. + if (bfqd->in_service_bic == NULL) {
  3247. + atomic_long_inc(&RQ_BIC(rq)->icq.ioc->refcount);
  3248. + bfqd->in_service_bic = RQ_BIC(rq);
  3249. + }
  3250. +
  3251. + if (bfqd->busy_queues > 1 && ((!bfq_bfqq_sync(bfqq) &&
  3252. + dispatched >= bfqd->bfq_max_budget_async_rq) ||
  3253. + bfq_class_idle(bfqq)))
  3254. + goto expire;
  3255. +
  3256. + return dispatched;
  3257. +
  3258. +expire:
  3259. + bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_EXHAUSTED);
  3260. + return dispatched;
  3261. +}
  3262. +
  3263. +static int __bfq_forced_dispatch_bfqq(struct bfq_queue *bfqq)
  3264. +{
  3265. + int dispatched = 0;
  3266. +
  3267. + while (bfqq->next_rq != NULL) {
  3268. + bfq_dispatch_insert(bfqq->bfqd->queue, bfqq->next_rq);
  3269. + dispatched++;
  3270. + }
  3271. +
  3272. + BUG_ON(!list_empty(&bfqq->fifo));
  3273. + return dispatched;
  3274. +}
  3275. +
  3276. +/*
  3277. + * Drain our current requests.
  3278. + * Used for barriers and when switching io schedulers on-the-fly.
  3279. + */
  3280. +static int bfq_forced_dispatch(struct bfq_data *bfqd)
  3281. +{
  3282. + struct bfq_queue *bfqq, *n;
  3283. + struct bfq_service_tree *st;
  3284. + int dispatched = 0;
  3285. +
  3286. + bfqq = bfqd->in_service_queue;
  3287. + if (bfqq != NULL)
  3288. + __bfq_bfqq_expire(bfqd, bfqq);
  3289. +
  3290. + /*
  3291. + * Loop through classes, and be careful to leave the scheduler
  3292. + * in a consistent state, as feedback mechanisms and vtime
  3293. + * updates cannot be disabled during the process.
  3294. + */
  3295. + list_for_each_entry_safe(bfqq, n, &bfqd->active_list, bfqq_list) {
  3296. + st = bfq_entity_service_tree(&bfqq->entity);
  3297. +
  3298. + dispatched += __bfq_forced_dispatch_bfqq(bfqq);
  3299. + bfqq->max_budget = bfq_max_budget(bfqd);
  3300. +
  3301. + bfq_forget_idle(st);
  3302. + }
  3303. +
  3304. + BUG_ON(bfqd->busy_queues != 0);
  3305. +
  3306. + return dispatched;
  3307. +}
  3308. +
  3309. +static int bfq_dispatch_requests(struct request_queue *q, int force)
  3310. +{
  3311. + struct bfq_data *bfqd = q->elevator->elevator_data;
  3312. + struct bfq_queue *bfqq;
  3313. + int max_dispatch;
  3314. +
  3315. + bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues);
  3316. + if (bfqd->busy_queues == 0)
  3317. + return 0;
  3318. +
  3319. + if (unlikely(force))
  3320. + return bfq_forced_dispatch(bfqd);
  3321. +
  3322. + bfqq = bfq_select_queue(bfqd);
  3323. + if (bfqq == NULL)
  3324. + return 0;
  3325. +
  3326. + max_dispatch = bfqd->bfq_quantum;
  3327. + if (bfq_class_idle(bfqq))
  3328. + max_dispatch = 1;
  3329. +
  3330. + if (!bfq_bfqq_sync(bfqq))
  3331. + max_dispatch = bfqd->bfq_max_budget_async_rq;
  3332. +
  3333. + if (bfqq->dispatched >= max_dispatch) {
  3334. + if (bfqd->busy_queues > 1)
  3335. + return 0;
  3336. + if (bfqq->dispatched >= 4 * max_dispatch)
  3337. + return 0;
  3338. + }
  3339. +
  3340. + if (bfqd->sync_flight != 0 && !bfq_bfqq_sync(bfqq))
  3341. + return 0;
  3342. +
  3343. + bfq_clear_bfqq_wait_request(bfqq);
  3344. + BUG_ON(timer_pending(&bfqd->idle_slice_timer));
  3345. +
  3346. + if (!bfq_dispatch_request(bfqd, bfqq))
  3347. + return 0;
  3348. +
  3349. + bfq_log_bfqq(bfqd, bfqq, "dispatched one request of %d (max_disp %d)",
  3350. + bfqq->pid, max_dispatch);
  3351. +
  3352. + return 1;
  3353. +}
  3354. +
  3355. +/*
  3356. + * Task holds one reference to the queue, dropped when task exits. Each rq
  3357. + * in-flight on this queue also holds a reference, dropped when rq is freed.
  3358. + *
  3359. + * Queue lock must be held here.
  3360. + */
  3361. +static void bfq_put_queue(struct bfq_queue *bfqq)
  3362. +{
  3363. + struct bfq_data *bfqd = bfqq->bfqd;
  3364. +
  3365. + BUG_ON(atomic_read(&bfqq->ref) <= 0);
  3366. +
  3367. + bfq_log_bfqq(bfqd, bfqq, "put_queue: %p %d", bfqq,
  3368. + atomic_read(&bfqq->ref));
  3369. + if (!atomic_dec_and_test(&bfqq->ref))
  3370. + return;
  3371. +
  3372. + BUG_ON(rb_first(&bfqq->sort_list) != NULL);
  3373. + BUG_ON(bfqq->allocated[READ] + bfqq->allocated[WRITE] != 0);
  3374. + BUG_ON(bfqq->entity.tree != NULL);
  3375. + BUG_ON(bfq_bfqq_busy(bfqq));
  3376. + BUG_ON(bfqd->in_service_queue == bfqq);
  3377. +
  3378. + bfq_log_bfqq(bfqd, bfqq, "put_queue: %p freed", bfqq);
  3379. +
  3380. + kmem_cache_free(bfq_pool, bfqq);
  3381. +}
  3382. +
  3383. +static void bfq_put_cooperator(struct bfq_queue *bfqq)
  3384. +{
  3385. + struct bfq_queue *__bfqq, *next;
  3386. +
  3387. + /*
  3388. + * If this queue was scheduled to merge with another queue, be
  3389. + * sure to drop the reference taken on that queue (and others in
  3390. + * the merge chain). See bfq_setup_merge and bfq_merge_bfqqs.
  3391. + */
  3392. + __bfqq = bfqq->new_bfqq;
  3393. + while (__bfqq) {
  3394. + if (__bfqq == bfqq)
  3395. + break;
  3396. + next = __bfqq->new_bfqq;
  3397. + bfq_put_queue(__bfqq);
  3398. + __bfqq = next;
  3399. + }
  3400. +}
  3401. +
  3402. +static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
  3403. +{
  3404. + if (bfqq == bfqd->in_service_queue) {
  3405. + __bfq_bfqq_expire(bfqd, bfqq);
  3406. + bfq_schedule_dispatch(bfqd);
  3407. + }
  3408. +
  3409. + bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq,
  3410. + atomic_read(&bfqq->ref));
  3411. +
  3412. + bfq_put_cooperator(bfqq);
  3413. +
  3414. + bfq_put_queue(bfqq);
  3415. +}
  3416. +
  3417. +static inline void bfq_init_icq(struct io_cq *icq)
  3418. +{
  3419. + struct bfq_io_cq *bic = icq_to_bic(icq);
  3420. +
  3421. + bic->ttime.last_end_request = jiffies;
  3422. +}
  3423. +
  3424. +static void bfq_exit_icq(struct io_cq *icq)
  3425. +{
  3426. + struct bfq_io_cq *bic = icq_to_bic(icq);
  3427. + struct bfq_data *bfqd = bic_to_bfqd(bic);
  3428. +
  3429. + if (bic->bfqq[BLK_RW_ASYNC]) {
  3430. + bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_ASYNC]);
  3431. + bic->bfqq[BLK_RW_ASYNC] = NULL;
  3432. + }
  3433. +
  3434. + if (bic->bfqq[BLK_RW_SYNC]) {
  3435. + bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_SYNC]);
  3436. + bic->bfqq[BLK_RW_SYNC] = NULL;
  3437. + }
  3438. +}
  3439. +
  3440. +/*
  3441. + * Update the entity prio values; note that the new values will not
  3442. + * be used until the next (re)activation.
  3443. + */
  3444. +static void bfq_init_prio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic)
  3445. +{
  3446. + struct task_struct *tsk = current;
  3447. + int ioprio_class;
  3448. +
  3449. + if (!bfq_bfqq_prio_changed(bfqq))
  3450. + return;
  3451. +
  3452. + ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio);
  3453. + switch (ioprio_class) {
  3454. + default:
  3455. + dev_err(bfqq->bfqd->queue->backing_dev_info.dev,
  3456. + "bfq: bad prio %x\n", ioprio_class);
  3457. + case IOPRIO_CLASS_NONE:
  3458. + /*
  3459. + * No prio set, inherit CPU scheduling settings.
  3460. + */
  3461. + bfqq->entity.new_ioprio = task_nice_ioprio(tsk);
  3462. + bfqq->entity.new_ioprio_class = task_nice_ioclass(tsk);
  3463. + break;
  3464. + case IOPRIO_CLASS_RT:
  3465. + bfqq->entity.new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
  3466. + bfqq->entity.new_ioprio_class = IOPRIO_CLASS_RT;
  3467. + break;
  3468. + case IOPRIO_CLASS_BE:
  3469. + bfqq->entity.new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
  3470. + bfqq->entity.new_ioprio_class = IOPRIO_CLASS_BE;
  3471. + break;
  3472. + case IOPRIO_CLASS_IDLE:
  3473. + bfqq->entity.new_ioprio_class = IOPRIO_CLASS_IDLE;
  3474. + bfqq->entity.new_ioprio = 7;
  3475. + bfq_clear_bfqq_idle_window(bfqq);
  3476. + break;
  3477. + }
  3478. +
  3479. + bfqq->entity.ioprio_changed = 1;
  3480. +
  3481. + bfq_clear_bfqq_prio_changed(bfqq);
  3482. +}
  3483. +
  3484. +static void bfq_changed_ioprio(struct bfq_io_cq *bic)
  3485. +{
  3486. + struct bfq_data *bfqd;
  3487. + struct bfq_queue *bfqq, *new_bfqq;
  3488. + struct bfq_group *bfqg;
  3489. + unsigned long uninitialized_var(flags);
  3490. + int ioprio = bic->icq.ioc->ioprio;
  3491. +
  3492. + bfqd = bfq_get_bfqd_locked(&(bic->icq.q->elevator->elevator_data),
  3493. + &flags);
  3494. + /*
3495. + * This condition may trigger on a newly created bic; be sure to
  3496. + * drop the lock before returning.
  3497. + */
  3498. + if (unlikely(bfqd == NULL) || likely(bic->ioprio == ioprio))
  3499. + goto out;
  3500. +
  3501. + bfqq = bic->bfqq[BLK_RW_ASYNC];
  3502. + if (bfqq != NULL) {
  3503. + bfqg = container_of(bfqq->entity.sched_data, struct bfq_group,
  3504. + sched_data);
  3505. + new_bfqq = bfq_get_queue(bfqd, bfqg, BLK_RW_ASYNC, bic,
  3506. + GFP_ATOMIC);
  3507. + if (new_bfqq != NULL) {
  3508. + bic->bfqq[BLK_RW_ASYNC] = new_bfqq;
  3509. + bfq_log_bfqq(bfqd, bfqq,
  3510. + "changed_ioprio: bfqq %p %d",
  3511. + bfqq, atomic_read(&bfqq->ref));
  3512. + bfq_put_queue(bfqq);
  3513. + }
  3514. + }
  3515. +
  3516. + bfqq = bic->bfqq[BLK_RW_SYNC];
  3517. + if (bfqq != NULL)
  3518. + bfq_mark_bfqq_prio_changed(bfqq);
  3519. +
  3520. + bic->ioprio = ioprio;
  3521. +
  3522. +out:
  3523. + bfq_put_bfqd_unlock(bfqd, &flags);
  3524. +}
  3525. +
  3526. +static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
  3527. + pid_t pid, int is_sync)
  3528. +{
  3529. + RB_CLEAR_NODE(&bfqq->entity.rb_node);
  3530. + INIT_LIST_HEAD(&bfqq->fifo);
  3531. +
  3532. + atomic_set(&bfqq->ref, 0);
  3533. + bfqq->bfqd = bfqd;
  3534. +
  3535. + bfq_mark_bfqq_prio_changed(bfqq);
  3536. +
  3537. + if (is_sync) {
  3538. + if (!bfq_class_idle(bfqq))
  3539. + bfq_mark_bfqq_idle_window(bfqq);
  3540. + bfq_mark_bfqq_sync(bfqq);
  3541. + }
  3542. + bfq_mark_bfqq_IO_bound(bfqq);
  3543. +
3544. + /* Tentative initial value to trade off between throughput and latency */
  3545. + bfqq->max_budget = (2 * bfq_max_budget(bfqd)) / 3;
  3546. + bfqq->pid = pid;
  3547. +
  3548. + bfqq->wr_coeff = 1;
  3549. + bfqq->last_wr_start_finish = 0;
  3550. + /*
  3551. + * Set to the value for which bfqq will not be deemed as
  3552. + * soft rt when it becomes backlogged.
  3553. + */
  3554. + bfqq->soft_rt_next_start = bfq_infinity_from_now(jiffies);
  3555. +}
  3556. +
  3557. +static struct bfq_queue *bfq_find_alloc_queue(struct bfq_data *bfqd,
  3558. + struct bfq_group *bfqg,
  3559. + int is_sync,
  3560. + struct bfq_io_cq *bic,
  3561. + gfp_t gfp_mask)
  3562. +{
  3563. + struct bfq_queue *bfqq, *new_bfqq = NULL;
  3564. +
  3565. +retry:
  3566. + /* bic always exists here */
  3567. + bfqq = bic_to_bfqq(bic, is_sync);
  3568. +
  3569. + /*
3570. + * Always try a new alloc if we fell back to the OOM bfqq
  3571. + * originally, since it should just be a temporary situation.
  3572. + */
  3573. + if (bfqq == NULL || bfqq == &bfqd->oom_bfqq) {
  3574. + bfqq = NULL;
  3575. + if (new_bfqq != NULL) {
  3576. + bfqq = new_bfqq;
  3577. + new_bfqq = NULL;
  3578. + } else if (gfp_mask & __GFP_WAIT) {
  3579. + spin_unlock_irq(bfqd->queue->queue_lock);
  3580. + new_bfqq = kmem_cache_alloc_node(bfq_pool,
  3581. + gfp_mask | __GFP_ZERO,
  3582. + bfqd->queue->node);
  3583. + spin_lock_irq(bfqd->queue->queue_lock);
  3584. + if (new_bfqq != NULL)
  3585. + goto retry;
  3586. + } else {
  3587. + bfqq = kmem_cache_alloc_node(bfq_pool,
  3588. + gfp_mask | __GFP_ZERO,
  3589. + bfqd->queue->node);
  3590. + }
  3591. +
  3592. + if (bfqq != NULL) {
  3593. + bfq_init_bfqq(bfqd, bfqq, current->pid, is_sync);
  3594. + bfq_log_bfqq(bfqd, bfqq, "allocated");
  3595. + } else {
  3596. + bfqq = &bfqd->oom_bfqq;
  3597. + bfq_log_bfqq(bfqd, bfqq, "using oom bfqq");
  3598. + }
  3599. +
  3600. + bfq_init_prio_data(bfqq, bic);
  3601. + bfq_init_entity(&bfqq->entity, bfqg);
  3602. + }
  3603. +
  3604. + if (new_bfqq != NULL)
  3605. + kmem_cache_free(bfq_pool, new_bfqq);
  3606. +
  3607. + return bfqq;
  3608. +}
  3609. +
  3610. +static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd,
  3611. + struct bfq_group *bfqg,
  3612. + int ioprio_class, int ioprio)
  3613. +{
  3614. + switch (ioprio_class) {
  3615. + case IOPRIO_CLASS_RT:
  3616. + return &bfqg->async_bfqq[0][ioprio];
  3617. + case IOPRIO_CLASS_NONE:
  3618. + ioprio = IOPRIO_NORM;
  3619. + /* fall through */
  3620. + case IOPRIO_CLASS_BE:
  3621. + return &bfqg->async_bfqq[1][ioprio];
  3622. + case IOPRIO_CLASS_IDLE:
  3623. + return &bfqg->async_idle_bfqq;
  3624. + default:
  3625. + BUG();
  3626. + }
  3627. +}
  3628. +
  3629. +static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
  3630. + struct bfq_group *bfqg, int is_sync,
  3631. + struct bfq_io_cq *bic, gfp_t gfp_mask)
  3632. +{
  3633. + const int ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
  3634. + const int ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio);
  3635. + struct bfq_queue **async_bfqq = NULL;
  3636. + struct bfq_queue *bfqq = NULL;
  3637. +
  3638. + if (!is_sync) {
  3639. + async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class,
  3640. + ioprio);
  3641. + bfqq = *async_bfqq;
  3642. + }
  3643. +
  3644. + if (bfqq == NULL)
  3645. + bfqq = bfq_find_alloc_queue(bfqd, bfqg, is_sync, bic, gfp_mask);
  3646. +
  3647. + /*
3648. + * Pin the queue now that it's allocated; scheduler exit will
  3649. + * prune it.
  3650. + */
  3651. + if (!is_sync && *async_bfqq == NULL) {
  3652. + atomic_inc(&bfqq->ref);
  3653. + bfq_log_bfqq(bfqd, bfqq, "get_queue, bfqq not in async: %p, %d",
  3654. + bfqq, atomic_read(&bfqq->ref));
  3655. + *async_bfqq = bfqq;
  3656. + }
  3657. +
  3658. + atomic_inc(&bfqq->ref);
  3659. + bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq,
  3660. + atomic_read(&bfqq->ref));
  3661. + return bfqq;
  3662. +}
  3663. +
  3664. +static void bfq_update_io_thinktime(struct bfq_data *bfqd,
  3665. + struct bfq_io_cq *bic)
  3666. +{
  3667. + unsigned long elapsed = jiffies - bic->ttime.last_end_request;
  3668. + unsigned long ttime = min(elapsed, 2UL * bfqd->bfq_slice_idle);
  3669. +
  3670. + bic->ttime.ttime_samples = (7*bic->ttime.ttime_samples + 256) / 8;
  3671. + bic->ttime.ttime_total = (7*bic->ttime.ttime_total + 256*ttime) / 8;
  3672. + bic->ttime.ttime_mean = (bic->ttime.ttime_total + 128) /
  3673. + bic->ttime.ttime_samples;
  3674. +}
  3675. +
  3676. +static void bfq_update_io_seektime(struct bfq_data *bfqd,
  3677. + struct bfq_queue *bfqq,
  3678. + struct request *rq)
  3679. +{
  3680. + sector_t sdist;
  3681. + u64 total;
  3682. +
  3683. + if (bfqq->last_request_pos < blk_rq_pos(rq))
  3684. + sdist = blk_rq_pos(rq) - bfqq->last_request_pos;
  3685. + else
  3686. + sdist = bfqq->last_request_pos - blk_rq_pos(rq);
  3687. +
  3688. + /*
  3689. + * Don't allow the seek distance to get too large from the
  3690. + * odd fragment, pagein, etc.
  3691. + */
  3692. + if (bfqq->seek_samples == 0) /* first request, not really a seek */
  3693. + sdist = 0;
  3694. + else if (bfqq->seek_samples <= 60) /* second & third seek */
  3695. + sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*1024);
  3696. + else
  3697. + sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*64);
  3698. +
  3699. + bfqq->seek_samples = (7*bfqq->seek_samples + 256) / 8;
  3700. + bfqq->seek_total = (7*bfqq->seek_total + (u64)256*sdist) / 8;
  3701. + total = bfqq->seek_total + (bfqq->seek_samples/2);
  3702. + do_div(total, bfqq->seek_samples);
  3703. + bfqq->seek_mean = (sector_t)total;
  3704. +
  3705. + bfq_log_bfqq(bfqd, bfqq, "dist=%llu mean=%llu", (u64)sdist,
  3706. + (u64)bfqq->seek_mean);
  3707. +}
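The seek statistics above use the same kind of exponentially weighted average as the peak-rate filter, with a weight counter that converges toward 256. A standalone sketch of just the averaging step (field names are illustrative, and the clamping of sdist against the previous mean is left out):

#include <stdio.h>
#include <stdint.h>

/*
 * Illustrative sketch, not part of the patch: the exponentially weighted
 * seek average kept above. The samples counter converges toward 256.
 */
struct seek_stats {
	unsigned int samples;
	uint64_t total;
	uint64_t mean;
};

static void update_seek(struct seek_stats *s, uint64_t sdist)
{
	s->samples = (7 * s->samples + 256) / 8;
	s->total = (7 * s->total + 256 * sdist) / 8;
	s->mean = (s->total + s->samples / 2) / s->samples;
}

int main(void)
{
	struct seek_stats s = { 0, 0, 0 };
	int i;

	/* three identical 1024-sector seeks: the mean stays close to 1024 */
	for (i = 0; i < 3; i++)
		update_seek(&s, 1024);
	printf("samples=%u mean=%llu\n", s.samples,
	       (unsigned long long)s.mean);
	return 0;
}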
  3708. +
  3709. +/*
  3710. + * Disable idle window if the process thinks too long or seeks so much that
  3711. + * it doesn't matter.
  3712. + */
  3713. +static void bfq_update_idle_window(struct bfq_data *bfqd,
  3714. + struct bfq_queue *bfqq,
  3715. + struct bfq_io_cq *bic)
  3716. +{
  3717. + int enable_idle;
  3718. +
  3719. + /* Don't idle for async or idle io prio class. */
  3720. + if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq))
  3721. + return;
  3722. +
  3723. + enable_idle = bfq_bfqq_idle_window(bfqq);
  3724. +
  3725. + if (atomic_read(&bic->icq.ioc->active_ref) == 0 ||
  3726. + bfqd->bfq_slice_idle == 0 ||
  3727. + (bfqd->hw_tag && BFQQ_SEEKY(bfqq) &&
  3728. + bfqq->wr_coeff == 1))
  3729. + enable_idle = 0;
  3730. + else if (bfq_sample_valid(bic->ttime.ttime_samples)) {
  3731. + if (bic->ttime.ttime_mean > bfqd->bfq_slice_idle &&
  3732. + bfqq->wr_coeff == 1)
  3733. + enable_idle = 0;
  3734. + else
  3735. + enable_idle = 1;
  3736. + }
  3737. + bfq_log_bfqq(bfqd, bfqq, "update_idle_window: enable_idle %d",
  3738. + enable_idle);
  3739. +
  3740. + if (enable_idle)
  3741. + bfq_mark_bfqq_idle_window(bfqq);
  3742. + else
  3743. + bfq_clear_bfqq_idle_window(bfqq);
  3744. +}
  3745. +
  3746. +/*
  3747. + * Called when a new fs request (rq) is added to bfqq. Check if there's
  3748. + * something we should do about it.
  3749. + */
  3750. +static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
  3751. + struct request *rq)
  3752. +{
  3753. + struct bfq_io_cq *bic = RQ_BIC(rq);
  3754. +
  3755. + if (rq->cmd_flags & REQ_META)
  3756. + bfqq->meta_pending++;
  3757. +
  3758. + bfq_update_io_thinktime(bfqd, bic);
  3759. + bfq_update_io_seektime(bfqd, bfqq, rq);
  3760. + if (!BFQQ_SEEKY(bfqq) && bfq_bfqq_constantly_seeky(bfqq)) {
  3761. + bfq_clear_bfqq_constantly_seeky(bfqq);
  3762. + if (!blk_queue_nonrot(bfqd->queue)) {
  3763. + BUG_ON(!bfqd->const_seeky_busy_in_flight_queues);
  3764. + bfqd->const_seeky_busy_in_flight_queues--;
  3765. + }
  3766. + }
  3767. + if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 ||
  3768. + !BFQQ_SEEKY(bfqq))
  3769. + bfq_update_idle_window(bfqd, bfqq, bic);
  3770. +
  3771. + bfq_log_bfqq(bfqd, bfqq,
  3772. + "rq_enqueued: idle_window=%d (seeky %d, mean %llu)",
  3773. + bfq_bfqq_idle_window(bfqq), BFQQ_SEEKY(bfqq),
  3774. + (long long unsigned)bfqq->seek_mean);
  3775. +
  3776. + bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);
  3777. +
  3778. + if (bfqq == bfqd->in_service_queue && bfq_bfqq_wait_request(bfqq)) {
  3779. + int small_req = bfqq->queued[rq_is_sync(rq)] == 1 &&
  3780. + blk_rq_sectors(rq) < 32;
  3781. + int budget_timeout = bfq_bfqq_budget_timeout(bfqq);
  3782. +
  3783. + /*
  3784. + * There is just this request queued: if the request
  3785. + * is small and the queue is not to be expired, then
  3786. + * just exit.
  3787. + *
  3788. + * In this way, if the disk is being idled to wait for
  3789. + * a new request from the in-service queue, we avoid
  3790. + * unplugging the device and committing the disk to serve
  3791. + * just a small request. On the contrary, we wait for
  3792. + * the block layer to decide when to unplug the device:
  3793. + * hopefully, new requests will be merged to this one
  3794. + * quickly, then the device will be unplugged and
  3795. + * larger requests will be dispatched.
  3796. + */
  3797. + if (small_req && !budget_timeout)
  3798. + return;
  3799. +
  3800. + /*
  3801. + * A large enough request arrived, or the queue is to
  3802. + * be expired: in both cases disk idling is to be
  3803. + * stopped, so clear wait_request flag and reset
  3804. + * timer.
  3805. + */
  3806. + bfq_clear_bfqq_wait_request(bfqq);
  3807. + del_timer(&bfqd->idle_slice_timer);
  3808. +
  3809. + /*
  3810. + * The queue is not empty, because a new request just
  3811. + * arrived. Hence we can safely expire the queue, in
  3812. + * case of budget timeout, without risking that the
  3813. + * timestamps of the queue are not updated correctly.
  3814. + * See [1] for more details.
  3815. + */
  3816. + if (budget_timeout)
  3817. + bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_TIMEOUT);
  3818. +
  3819. + /*
  3820. + * Let the request rip immediately, or let a new queue be
  3821. + * selected if bfqq has just been expired.
  3822. + */
  3823. + __blk_run_queue(bfqd->queue);
  3824. + }
  3825. +}
  3826. +
  3827. +static void bfq_insert_request(struct request_queue *q, struct request *rq)
  3828. +{
  3829. + struct bfq_data *bfqd = q->elevator->elevator_data;
  3830. + struct bfq_queue *bfqq = RQ_BFQQ(rq);
  3831. +
  3832. + assert_spin_locked(bfqd->queue->queue_lock);
  3833. + bfq_init_prio_data(bfqq, RQ_BIC(rq));
  3834. +
  3835. + bfq_add_request(rq);
  3836. +
  3837. + rq->fifo_time = jiffies + bfqd->bfq_fifo_expire[rq_is_sync(rq)];
  3838. + list_add_tail(&rq->queuelist, &bfqq->fifo);
  3839. +
  3840. + bfq_rq_enqueued(bfqd, bfqq, rq);
  3841. +}
  3842. +
  3843. +static void bfq_update_hw_tag(struct bfq_data *bfqd)
  3844. +{
  3845. + bfqd->max_rq_in_driver = max(bfqd->max_rq_in_driver,
  3846. + bfqd->rq_in_driver);
  3847. +
  3848. + if (bfqd->hw_tag == 1)
  3849. + return;
  3850. +
  3851. + /*
  3852. + * This sample is valid if the number of outstanding requests
  3853. + * is large enough to allow queueing behavior. Note that the
  3854. + * sum is not exact, as it's not taking into account deactivated
  3855. + * requests.
  3856. + */
  3857. + if (bfqd->rq_in_driver + bfqd->queued < BFQ_HW_QUEUE_THRESHOLD)
  3858. + return;
  3859. +
  3860. + if (bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES)
  3861. + return;
  3862. +
  3863. + bfqd->hw_tag = bfqd->max_rq_in_driver > BFQ_HW_QUEUE_THRESHOLD;
  3864. + bfqd->max_rq_in_driver = 0;
  3865. + bfqd->hw_tag_samples = 0;
  3866. +}
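/*
 * Editorial note (not part of the patch): hw_tag is a heuristic guess that
 * the device does internal command queueing (e.g. NCQ). It is evaluated only
 * after BFQ_HW_QUEUE_SAMPLES valid samples, i.e. samples taken while enough
 * requests were outstanding for queueing to be observable, and is set if the
 * peak number of requests in the driver exceeded BFQ_HW_QUEUE_THRESHOLD.
 * bfq_update_idle_window() above uses it to disable idling for seeky,
 * non-weight-raised queues on such devices.
 */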
  3867. +
  3868. +static void bfq_completed_request(struct request_queue *q, struct request *rq)
  3869. +{
  3870. + struct bfq_queue *bfqq = RQ_BFQQ(rq);
  3871. + struct bfq_data *bfqd = bfqq->bfqd;
  3872. + bool sync = bfq_bfqq_sync(bfqq);
  3873. +
  3874. + bfq_log_bfqq(bfqd, bfqq, "completed one req with %u sects left (%d)",
  3875. + blk_rq_sectors(rq), sync);
  3876. +
  3877. + bfq_update_hw_tag(bfqd);
  3878. +
  3879. + BUG_ON(!bfqd->rq_in_driver);
  3880. + BUG_ON(!bfqq->dispatched);
  3881. + bfqd->rq_in_driver--;
  3882. + bfqq->dispatched--;
  3883. +
  3884. + if (!bfqq->dispatched && !bfq_bfqq_busy(bfqq)) {
  3885. + bfq_weights_tree_remove(bfqd, &bfqq->entity,
  3886. + &bfqd->queue_weights_tree);
  3887. + if (!blk_queue_nonrot(bfqd->queue)) {
  3888. + BUG_ON(!bfqd->busy_in_flight_queues);
  3889. + bfqd->busy_in_flight_queues--;
  3890. + if (bfq_bfqq_constantly_seeky(bfqq)) {
  3891. + BUG_ON(!bfqd->
  3892. + const_seeky_busy_in_flight_queues);
  3893. + bfqd->const_seeky_busy_in_flight_queues--;
  3894. + }
  3895. + }
  3896. + }
  3897. +
  3898. + if (sync) {
  3899. + bfqd->sync_flight--;
  3900. + RQ_BIC(rq)->ttime.last_end_request = jiffies;
  3901. + }
  3902. +
  3903. + /*
  3904. + * If we are waiting to discover whether the request pattern of the
  3905. + * task associated with the queue is actually isochronous, and
  3906. + * both requisites for this condition to hold are satisfied, then
  3907. + * compute soft_rt_next_start (see the comments to the function
  3908. + * bfq_bfqq_softrt_next_start()).
  3909. + */
  3910. + if (bfq_bfqq_softrt_update(bfqq) && bfqq->dispatched == 0 &&
  3911. + RB_EMPTY_ROOT(&bfqq->sort_list))
  3912. + bfqq->soft_rt_next_start =
  3913. + bfq_bfqq_softrt_next_start(bfqd, bfqq);
  3914. +
  3915. + /*
  3916. + * If this is the in-service queue, check if it needs to be expired,
  3917. + * or if we want to idle in case it has no pending requests.
  3918. + */
  3919. + if (bfqd->in_service_queue == bfqq) {
  3920. + if (bfq_bfqq_budget_new(bfqq))
  3921. + bfq_set_budget_timeout(bfqd);
  3922. +
  3923. + if (bfq_bfqq_must_idle(bfqq)) {
  3924. + bfq_arm_slice_timer(bfqd);
  3925. + goto out;
  3926. + } else if (bfq_may_expire_for_budg_timeout(bfqq))
  3927. + bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_TIMEOUT);
  3928. + else if (RB_EMPTY_ROOT(&bfqq->sort_list) &&
  3929. + (bfqq->dispatched == 0 ||
  3930. + !bfq_bfqq_must_not_expire(bfqq)))
  3931. + bfq_bfqq_expire(bfqd, bfqq, 0,
  3932. + BFQ_BFQQ_NO_MORE_REQUESTS);
  3933. + }
  3934. +
  3935. + if (!bfqd->rq_in_driver)
  3936. + bfq_schedule_dispatch(bfqd);
  3937. +
  3938. +out:
  3939. + return;
  3940. +}
  3941. +
  3942. +static inline int __bfq_may_queue(struct bfq_queue *bfqq)
  3943. +{
  3944. + if (bfq_bfqq_wait_request(bfqq) && bfq_bfqq_must_alloc(bfqq)) {
  3945. + bfq_clear_bfqq_must_alloc(bfqq);
  3946. + return ELV_MQUEUE_MUST;
  3947. + }
  3948. +
  3949. + return ELV_MQUEUE_MAY;
  3950. +}
  3951. +
  3952. +static int bfq_may_queue(struct request_queue *q, int rw)
  3953. +{
  3954. + struct bfq_data *bfqd = q->elevator->elevator_data;
  3955. + struct task_struct *tsk = current;
  3956. + struct bfq_io_cq *bic;
  3957. + struct bfq_queue *bfqq;
  3958. +
  3959. + /*
  3960. + * Don't force setup of a queue from here, as a call to may_queue
  3961. + * does not necessarily imply that a request actually will be
  3962. + * queued. So just lookup a possibly existing queue, or return
  3963. + * 'may queue' if that fails.
  3964. + */
  3965. + bic = bfq_bic_lookup(bfqd, tsk->io_context);
  3966. + if (bic == NULL)
  3967. + return ELV_MQUEUE_MAY;
  3968. +
  3969. + bfqq = bic_to_bfqq(bic, rw_is_sync(rw));
  3970. + if (bfqq != NULL) {
  3971. + bfq_init_prio_data(bfqq, bic);
  3972. +
  3973. + return __bfq_may_queue(bfqq);
  3974. + }
  3975. +
  3976. + return ELV_MQUEUE_MAY;
  3977. +}
  3978. +
  3979. +/*
  3980. + * Queue lock held here.
  3981. + */
  3982. +static void bfq_put_request(struct request *rq)
  3983. +{
  3984. + struct bfq_queue *bfqq = RQ_BFQQ(rq);
  3985. +
  3986. + if (bfqq != NULL) {
  3987. + const int rw = rq_data_dir(rq);
  3988. +
  3989. + BUG_ON(!bfqq->allocated[rw]);
  3990. + bfqq->allocated[rw]--;
  3991. +
  3992. + rq->elv.priv[0] = NULL;
  3993. + rq->elv.priv[1] = NULL;
  3994. +
  3995. + bfq_log_bfqq(bfqq->bfqd, bfqq, "put_request %p, %d",
  3996. + bfqq, atomic_read(&bfqq->ref));
  3997. + bfq_put_queue(bfqq);
  3998. + }
  3999. +}
  4000. +
  4001. +static struct bfq_queue *
  4002. +bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,
  4003. + struct bfq_queue *bfqq)
  4004. +{
  4005. + bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu",
  4006. + (long unsigned)bfqq->new_bfqq->pid);
  4007. + bic_set_bfqq(bic, bfqq->new_bfqq, 1);
  4008. + bfq_mark_bfqq_coop(bfqq->new_bfqq);
  4009. + bfq_put_queue(bfqq);
  4010. + return bic_to_bfqq(bic, 1);
  4011. +}
  4012. +
  4013. +/*
  4014. + * Returns NULL if a new bfqq should be allocated, or the old bfqq if this
  4015. + * was the last process referring to said bfqq.
  4016. + */
  4017. +static struct bfq_queue *
  4018. +bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq)
  4019. +{
  4020. + bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue");
  4021. + if (bfqq_process_refs(bfqq) == 1) {
  4022. + bfqq->pid = current->pid;
  4023. + bfq_clear_bfqq_coop(bfqq);
  4024. + bfq_clear_bfqq_split_coop(bfqq);
  4025. + return bfqq;
  4026. + }
  4027. +
  4028. + bic_set_bfqq(bic, NULL, 1);
  4029. +
  4030. + bfq_put_cooperator(bfqq);
  4031. +
  4032. + bfq_put_queue(bfqq);
  4033. + return NULL;
  4034. +}
  4035. +
  4036. +/*
  4037. + * Allocate bfq data structures associated with this request.
  4038. + */
  4039. +static int bfq_set_request(struct request_queue *q, struct request *rq,
  4040. + struct bio *bio, gfp_t gfp_mask)
  4041. +{
  4042. + struct bfq_data *bfqd = q->elevator->elevator_data;
  4043. + struct bfq_io_cq *bic = icq_to_bic(rq->elv.icq);
  4044. + const int rw = rq_data_dir(rq);
  4045. + const int is_sync = rq_is_sync(rq);
  4046. + struct bfq_queue *bfqq;
  4047. + struct bfq_group *bfqg;
  4048. + unsigned long flags;
  4049. +
  4050. + might_sleep_if(gfp_mask & __GFP_WAIT);
  4051. +
  4052. + bfq_changed_ioprio(bic);
  4053. +
  4054. + spin_lock_irqsave(q->queue_lock, flags);
  4055. +
  4056. + if (bic == NULL)
  4057. + goto queue_fail;
  4058. +
  4059. + bfqg = bfq_bic_update_cgroup(bic);
  4060. +
  4061. +new_queue:
  4062. + bfqq = bic_to_bfqq(bic, is_sync);
  4063. + if (bfqq == NULL || bfqq == &bfqd->oom_bfqq) {
  4064. + bfqq = bfq_get_queue(bfqd, bfqg, is_sync, bic, gfp_mask);
  4065. + bic_set_bfqq(bic, bfqq, is_sync);
  4066. + } else {
  4067. + /*
  4068. + * If the queue was seeky for too long, break it apart.
  4069. + */
  4070. + if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) {
  4071. + bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq");
  4072. + bfqq = bfq_split_bfqq(bic, bfqq);
  4073. + if (!bfqq)
  4074. + goto new_queue;
  4075. + }
  4076. +
  4077. + /*
  4078. + * Check to see if this queue is scheduled to merge with
  4079. + * another closely cooperating queue. The merging of queues
  4080. + * happens here as it must be done in process context.
  4081. + * The reference on new_bfqq was taken in merge_bfqqs.
  4082. + */
  4083. + if (bfqq->new_bfqq != NULL)
  4084. + bfqq = bfq_merge_bfqqs(bfqd, bic, bfqq);
  4085. + }
  4086. +
  4087. + bfqq->allocated[rw]++;
  4088. + atomic_inc(&bfqq->ref);
  4089. + bfq_log_bfqq(bfqd, bfqq, "set_request: bfqq %p, %d", bfqq,
  4090. + atomic_read(&bfqq->ref));
  4091. +
  4092. + rq->elv.priv[0] = bic;
  4093. + rq->elv.priv[1] = bfqq;
  4094. +
  4095. + spin_unlock_irqrestore(q->queue_lock, flags);
  4096. +
  4097. + return 0;
  4098. +
  4099. +queue_fail:
  4100. + bfq_schedule_dispatch(bfqd);
  4101. + spin_unlock_irqrestore(q->queue_lock, flags);
  4102. +
  4103. + return 1;
  4104. +}
  4105. +
  4106. +static void bfq_kick_queue(struct work_struct *work)
  4107. +{
  4108. + struct bfq_data *bfqd =
  4109. + container_of(work, struct bfq_data, unplug_work);
  4110. + struct request_queue *q = bfqd->queue;
  4111. +
  4112. + spin_lock_irq(q->queue_lock);
  4113. + __blk_run_queue(q);
  4114. + spin_unlock_irq(q->queue_lock);
  4115. +}
  4116. +
  4117. +/*
  4118. + * Handler of the expiration of the timer running if the in-service queue
  4119. + * is idling inside its time slice.
  4120. + */
  4121. +static void bfq_idle_slice_timer(unsigned long data)
  4122. +{
  4123. + struct bfq_data *bfqd = (struct bfq_data *)data;
  4124. + struct bfq_queue *bfqq;
  4125. + unsigned long flags;
  4126. + enum bfqq_expiration reason;
  4127. +
  4128. + spin_lock_irqsave(bfqd->queue->queue_lock, flags);
  4129. +
  4130. + bfqq = bfqd->in_service_queue;
  4131. + /*
  4132. + * Theoretical race here: the in-service queue can be NULL or
  4133. + * different from the queue that was idling if the timer handler
  4134. + * spins on the queue_lock and a new request arrives for the
  4135. + * current queue and there is a full dispatch cycle that changes
  4136. + * the in-service queue. This can hardly happen, but in the worst
  4137. + * case we just expire a queue too early.
  4138. + */
  4139. + if (bfqq != NULL) {
  4140. + bfq_log_bfqq(bfqd, bfqq, "slice_timer expired");
  4141. + if (bfq_bfqq_budget_timeout(bfqq))
  4142. + /*
  4143. + * Also here the queue can be safely expired
  4144. + * for budget timeout without wasting
  4145. + * guarantees
  4146. + */
  4147. + reason = BFQ_BFQQ_BUDGET_TIMEOUT;
  4148. + else if (bfqq->queued[0] == 0 && bfqq->queued[1] == 0)
  4149. + /*
  4150. + * The queue may not be empty upon timer expiration,
  4151. + * because we may not disable the timer when the
  4152. + * first request of the in-service queue arrives
  4153. + * during disk idling.
  4154. + */
  4155. + reason = BFQ_BFQQ_TOO_IDLE;
  4156. + else
  4157. + goto schedule_dispatch;
  4158. +
  4159. + bfq_bfqq_expire(bfqd, bfqq, 1, reason);
  4160. + }
  4161. +
  4162. +schedule_dispatch:
  4163. + bfq_schedule_dispatch(bfqd);
  4164. +
  4165. + spin_unlock_irqrestore(bfqd->queue->queue_lock, flags);
  4166. +}
  4167. +
  4168. +static void bfq_shutdown_timer_wq(struct bfq_data *bfqd)
  4169. +{
  4170. + del_timer_sync(&bfqd->idle_slice_timer);
  4171. + cancel_work_sync(&bfqd->unplug_work);
  4172. +}
  4173. +
  4174. +static inline void __bfq_put_async_bfqq(struct bfq_data *bfqd,
  4175. + struct bfq_queue **bfqq_ptr)
  4176. +{
  4177. + struct bfq_group *root_group = bfqd->root_group;
  4178. + struct bfq_queue *bfqq = *bfqq_ptr;
  4179. +
  4180. + bfq_log(bfqd, "put_async_bfqq: %p", bfqq);
  4181. + if (bfqq != NULL) {
  4182. + bfq_bfqq_move(bfqd, bfqq, &bfqq->entity, root_group);
  4183. + bfq_log_bfqq(bfqd, bfqq, "put_async_bfqq: putting %p, %d",
  4184. + bfqq, atomic_read(&bfqq->ref));
  4185. + bfq_put_queue(bfqq);
  4186. + *bfqq_ptr = NULL;
  4187. + }
  4188. +}
  4189. +
  4190. +/*
  4191. + * Release all the bfqg references to its async queues. If we are
  4192. + * deallocating the group these queues may still contain requests, so
  4193. + * we reparent them to the root cgroup (i.e., the only one that will
  4194. + * exist for sure until all the requests on a device are gone).
  4195. + */
  4196. +static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg)
  4197. +{
  4198. + int i, j;
  4199. +
  4200. + for (i = 0; i < 2; i++)
  4201. + for (j = 0; j < IOPRIO_BE_NR; j++)
  4202. + __bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j]);
  4203. +
  4204. + __bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq);
  4205. +}
  4206. +
  4207. +static void bfq_exit_queue(struct elevator_queue *e)
  4208. +{
  4209. + struct bfq_data *bfqd = e->elevator_data;
  4210. + struct request_queue *q = bfqd->queue;
  4211. + struct bfq_queue *bfqq, *n;
  4212. +
  4213. + bfq_shutdown_timer_wq(bfqd);
  4214. +
  4215. + spin_lock_irq(q->queue_lock);
  4216. +
  4217. + BUG_ON(bfqd->in_service_queue != NULL);
  4218. + list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list)
  4219. + bfq_deactivate_bfqq(bfqd, bfqq, 0);
  4220. +
  4221. + bfq_disconnect_groups(bfqd);
  4222. + spin_unlock_irq(q->queue_lock);
  4223. +
  4224. + bfq_shutdown_timer_wq(bfqd);
  4225. +
  4226. + synchronize_rcu();
  4227. +
  4228. + BUG_ON(timer_pending(&bfqd->idle_slice_timer));
  4229. +
  4230. + bfq_free_root_group(bfqd);
  4231. + kfree(bfqd);
  4232. +}
  4233. +
  4234. +static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
  4235. +{
  4236. + struct bfq_group *bfqg;
  4237. + struct bfq_data *bfqd;
  4238. + struct elevator_queue *eq;
  4239. +
  4240. + eq = elevator_alloc(q, e);
  4241. + if (eq == NULL)
  4242. + return -ENOMEM;
  4243. +
  4244. + bfqd = kzalloc_node(sizeof(*bfqd), GFP_KERNEL, q->node);
  4245. + if (bfqd == NULL) {
  4246. + kobject_put(&eq->kobj);
  4247. + return -ENOMEM;
  4248. + }
  4249. + eq->elevator_data = bfqd;
  4250. +
  4251. + /*
  4252. + * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues.
  4253. + * Grab a permanent reference to it, so that the normal code flow
  4254. + * will not attempt to free it.
  4255. + */
  4256. + bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, 1, 0);
  4257. + atomic_inc(&bfqd->oom_bfqq.ref);
  4258. +
  4259. + bfqd->queue = q;
  4260. +
  4261. + spin_lock_irq(q->queue_lock);
  4262. + q->elevator = eq;
  4263. + spin_unlock_irq(q->queue_lock);
  4264. +
  4265. + bfqg = bfq_alloc_root_group(bfqd, q->node);
  4266. + if (bfqg == NULL) {
  4267. + kfree(bfqd);
  4268. + kobject_put(&eq->kobj);
  4269. + return -ENOMEM;
  4270. + }
  4271. +
  4272. + bfqd->root_group = bfqg;
  4273. +#ifdef CONFIG_CGROUP_BFQIO
  4274. + bfqd->active_numerous_groups = 0;
  4275. +#endif
  4276. +
  4277. + init_timer(&bfqd->idle_slice_timer);
  4278. + bfqd->idle_slice_timer.function = bfq_idle_slice_timer;
  4279. + bfqd->idle_slice_timer.data = (unsigned long)bfqd;
  4280. +
  4281. + bfqd->rq_pos_tree = RB_ROOT;
  4282. + bfqd->queue_weights_tree = RB_ROOT;
  4283. + bfqd->group_weights_tree = RB_ROOT;
  4284. +
  4285. + INIT_WORK(&bfqd->unplug_work, bfq_kick_queue);
  4286. +
  4287. + INIT_LIST_HEAD(&bfqd->active_list);
  4288. + INIT_LIST_HEAD(&bfqd->idle_list);
  4289. +
  4290. + bfqd->hw_tag = -1;
  4291. +
  4292. + bfqd->bfq_max_budget = bfq_default_max_budget;
  4293. +
  4294. + bfqd->bfq_quantum = bfq_quantum;
  4295. + bfqd->bfq_fifo_expire[0] = bfq_fifo_expire[0];
  4296. + bfqd->bfq_fifo_expire[1] = bfq_fifo_expire[1];
  4297. + bfqd->bfq_back_max = bfq_back_max;
  4298. + bfqd->bfq_back_penalty = bfq_back_penalty;
  4299. + bfqd->bfq_slice_idle = bfq_slice_idle;
  4300. + bfqd->bfq_class_idle_last_service = 0;
  4301. + bfqd->bfq_max_budget_async_rq = bfq_max_budget_async_rq;
  4302. + bfqd->bfq_timeout[BLK_RW_ASYNC] = bfq_timeout_async;
  4303. + bfqd->bfq_timeout[BLK_RW_SYNC] = bfq_timeout_sync;
  4304. +
  4305. + bfqd->bfq_coop_thresh = 2;
  4306. + bfqd->bfq_failed_cooperations = 7000;
  4307. + bfqd->bfq_requests_within_timer = 120;
  4308. +
  4309. + bfqd->low_latency = true;
  4310. +
  4311. + bfqd->bfq_wr_coeff = 20;
  4312. + bfqd->bfq_wr_rt_max_time = msecs_to_jiffies(300);
  4313. + bfqd->bfq_wr_max_time = 0;
  4314. + bfqd->bfq_wr_min_idle_time = msecs_to_jiffies(2000);
  4315. + bfqd->bfq_wr_min_inter_arr_async = msecs_to_jiffies(500);
  4316. + bfqd->bfq_wr_max_softrt_rate = 7000; /*
  4317. + * Approximate rate required
  4318. + * to playback or record a
  4319. + * high-definition compressed
  4320. + * video.
  4321. + */
  4322. + bfqd->wr_busy_queues = 0;
  4323. + bfqd->busy_in_flight_queues = 0;
  4324. + bfqd->const_seeky_busy_in_flight_queues = 0;
  4325. +
  4326. + /*
  4327. + * Begin by assuming, optimistically, that the device peak rate is
  4328. + * equal to the highest reference rate.
  4329. + */
  4330. + bfqd->RT_prod = R_fast[blk_queue_nonrot(bfqd->queue)] *
  4331. + T_fast[blk_queue_nonrot(bfqd->queue)];
  4332. + bfqd->peak_rate = R_fast[blk_queue_nonrot(bfqd->queue)];
  4333. + bfqd->device_speed = BFQ_BFQD_FAST;
  4334. +
  4335. + return 0;
  4336. +}
  4337. +
  4338. +static void bfq_slab_kill(void)
  4339. +{
  4340. + if (bfq_pool != NULL)
  4341. + kmem_cache_destroy(bfq_pool);
  4342. +}
  4343. +
  4344. +static int __init bfq_slab_setup(void)
  4345. +{
  4346. + bfq_pool = KMEM_CACHE(bfq_queue, 0);
  4347. + if (bfq_pool == NULL)
  4348. + return -ENOMEM;
  4349. + return 0;
  4350. +}
  4351. +
  4352. +static ssize_t bfq_var_show(unsigned int var, char *page)
  4353. +{
  4354. + return sprintf(page, "%d\n", var);
  4355. +}
  4356. +
  4357. +static ssize_t bfq_var_store(unsigned long *var, const char *page,
  4358. + size_t count)
  4359. +{
  4360. + unsigned long new_val;
  4361. + int ret = kstrtoul(page, 10, &new_val);
  4362. +
  4363. + if (ret == 0)
  4364. + *var = new_val;
  4365. +
  4366. + return count;
  4367. +}
  4368. +
  4369. +static ssize_t bfq_wr_max_time_show(struct elevator_queue *e, char *page)
  4370. +{
  4371. + struct bfq_data *bfqd = e->elevator_data;
  4372. + return sprintf(page, "%d\n", bfqd->bfq_wr_max_time > 0 ?
  4373. + jiffies_to_msecs(bfqd->bfq_wr_max_time) :
  4374. + jiffies_to_msecs(bfq_wr_duration(bfqd)));
  4375. +}
  4376. +
  4377. +static ssize_t bfq_weights_show(struct elevator_queue *e, char *page)
  4378. +{
  4379. + struct bfq_queue *bfqq;
  4380. + struct bfq_data *bfqd = e->elevator_data;
  4381. + ssize_t num_char = 0;
  4382. +
  4383. + num_char += sprintf(page + num_char, "Tot reqs queued %d\n\n",
  4384. + bfqd->queued);
  4385. +
  4386. + spin_lock_irq(bfqd->queue->queue_lock);
  4387. +
  4388. + num_char += sprintf(page + num_char, "Active:\n");
  4389. + list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) {
  4390. + num_char += sprintf(page + num_char,
  4391. + "pid%d: weight %hu, nr_queued %d %d, dur %d/%u\n",
  4392. + bfqq->pid,
  4393. + bfqq->entity.weight,
  4394. + bfqq->queued[0],
  4395. + bfqq->queued[1],
  4396. + jiffies_to_msecs(jiffies - bfqq->last_wr_start_finish),
  4397. + jiffies_to_msecs(bfqq->wr_cur_max_time));
  4398. + }
  4399. +
  4400. + num_char += sprintf(page + num_char, "Idle:\n");
  4401. + list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list) {
  4402. + num_char += sprintf(page + num_char,
  4403. + "pid%d: weight %hu, dur %d/%u\n",
  4404. + bfqq->pid,
  4405. + bfqq->entity.weight,
  4406. + jiffies_to_msecs(jiffies -
  4407. + bfqq->last_wr_start_finish),
  4408. + jiffies_to_msecs(bfqq->wr_cur_max_time));
  4409. + }
  4410. +
  4411. + spin_unlock_irq(bfqd->queue->queue_lock);
  4412. +
  4413. + return num_char;
  4414. +}
  4415. +
  4416. +#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \
  4417. +static ssize_t __FUNC(struct elevator_queue *e, char *page) \
  4418. +{ \
  4419. + struct bfq_data *bfqd = e->elevator_data; \
  4420. + unsigned int __data = __VAR; \
  4421. + if (__CONV) \
  4422. + __data = jiffies_to_msecs(__data); \
  4423. + return bfq_var_show(__data, (page)); \
  4424. +}
  4425. +SHOW_FUNCTION(bfq_quantum_show, bfqd->bfq_quantum, 0);
  4426. +SHOW_FUNCTION(bfq_fifo_expire_sync_show, bfqd->bfq_fifo_expire[1], 1);
  4427. +SHOW_FUNCTION(bfq_fifo_expire_async_show, bfqd->bfq_fifo_expire[0], 1);
  4428. +SHOW_FUNCTION(bfq_back_seek_max_show, bfqd->bfq_back_max, 0);
  4429. +SHOW_FUNCTION(bfq_back_seek_penalty_show, bfqd->bfq_back_penalty, 0);
  4430. +SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 1);
  4431. +SHOW_FUNCTION(bfq_max_budget_show, bfqd->bfq_user_max_budget, 0);
  4432. +SHOW_FUNCTION(bfq_max_budget_async_rq_show,
  4433. + bfqd->bfq_max_budget_async_rq, 0);
  4434. +SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout[BLK_RW_SYNC], 1);
  4435. +SHOW_FUNCTION(bfq_timeout_async_show, bfqd->bfq_timeout[BLK_RW_ASYNC], 1);
  4436. +SHOW_FUNCTION(bfq_low_latency_show, bfqd->low_latency, 0);
  4437. +SHOW_FUNCTION(bfq_wr_coeff_show, bfqd->bfq_wr_coeff, 0);
  4438. +SHOW_FUNCTION(bfq_wr_rt_max_time_show, bfqd->bfq_wr_rt_max_time, 1);
  4439. +SHOW_FUNCTION(bfq_wr_min_idle_time_show, bfqd->bfq_wr_min_idle_time, 1);
  4440. +SHOW_FUNCTION(bfq_wr_min_inter_arr_async_show, bfqd->bfq_wr_min_inter_arr_async,
  4441. + 1);
  4442. +SHOW_FUNCTION(bfq_wr_max_softrt_rate_show, bfqd->bfq_wr_max_softrt_rate, 0);
  4443. +#undef SHOW_FUNCTION
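/*
 * Editorial note (not part of the patch): as an illustration, the
 * SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 1) line above
 * expands, modulo whitespace, to:
 *
 *	static ssize_t bfq_slice_idle_show(struct elevator_queue *e, char *page)
 *	{
 *		struct bfq_data *bfqd = e->elevator_data;
 *		unsigned int __data = bfqd->bfq_slice_idle;
 *		if (1)
 *			__data = jiffies_to_msecs(__data);
 *		return bfq_var_show(__data, (page));
 *	}
 *
 * so attributes declared with __CONV == 1 are stored in jiffies but reported
 * to user space in milliseconds.
 */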
  4444. +
  4445. +#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \
  4446. +static ssize_t \
  4447. +__FUNC(struct elevator_queue *e, const char *page, size_t count) \
  4448. +{ \
  4449. + struct bfq_data *bfqd = e->elevator_data; \
  4450. + unsigned long uninitialized_var(__data); \
  4451. + int ret = bfq_var_store(&__data, (page), count); \
  4452. + if (__data < (MIN)) \
  4453. + __data = (MIN); \
  4454. + else if (__data > (MAX)) \
  4455. + __data = (MAX); \
  4456. + if (__CONV) \
  4457. + *(__PTR) = msecs_to_jiffies(__data); \
  4458. + else \
  4459. + *(__PTR) = __data; \
  4460. + return ret; \
  4461. +}
  4462. +STORE_FUNCTION(bfq_quantum_store, &bfqd->bfq_quantum, 1, INT_MAX, 0);
  4463. +STORE_FUNCTION(bfq_fifo_expire_sync_store, &bfqd->bfq_fifo_expire[1], 1,
  4464. + INT_MAX, 1);
  4465. +STORE_FUNCTION(bfq_fifo_expire_async_store, &bfqd->bfq_fifo_expire[0], 1,
  4466. + INT_MAX, 1);
  4467. +STORE_FUNCTION(bfq_back_seek_max_store, &bfqd->bfq_back_max, 0, INT_MAX, 0);
  4468. +STORE_FUNCTION(bfq_back_seek_penalty_store, &bfqd->bfq_back_penalty, 1,
  4469. + INT_MAX, 0);
  4470. +STORE_FUNCTION(bfq_slice_idle_store, &bfqd->bfq_slice_idle, 0, INT_MAX, 1);
  4471. +STORE_FUNCTION(bfq_max_budget_async_rq_store, &bfqd->bfq_max_budget_async_rq,
  4472. + 1, INT_MAX, 0);
  4473. +STORE_FUNCTION(bfq_timeout_async_store, &bfqd->bfq_timeout[BLK_RW_ASYNC], 0,
  4474. + INT_MAX, 1);
  4475. +STORE_FUNCTION(bfq_wr_coeff_store, &bfqd->bfq_wr_coeff, 1, INT_MAX, 0);
  4476. +STORE_FUNCTION(bfq_wr_max_time_store, &bfqd->bfq_wr_max_time, 0, INT_MAX, 1);
  4477. +STORE_FUNCTION(bfq_wr_rt_max_time_store, &bfqd->bfq_wr_rt_max_time, 0, INT_MAX,
  4478. + 1);
  4479. +STORE_FUNCTION(bfq_wr_min_idle_time_store, &bfqd->bfq_wr_min_idle_time, 0,
  4480. + INT_MAX, 1);
  4481. +STORE_FUNCTION(bfq_wr_min_inter_arr_async_store,
  4482. + &bfqd->bfq_wr_min_inter_arr_async, 0, INT_MAX, 1);
  4483. +STORE_FUNCTION(bfq_wr_max_softrt_rate_store, &bfqd->bfq_wr_max_softrt_rate, 0,
  4484. + INT_MAX, 0);
  4485. +#undef STORE_FUNCTION
  4486. +
  4487. +/* do nothing for the moment */
  4488. +static ssize_t bfq_weights_store(struct elevator_queue *e,
  4489. + const char *page, size_t count)
  4490. +{
  4491. + return count;
  4492. +}
  4493. +
  4494. +static inline unsigned long bfq_estimated_max_budget(struct bfq_data *bfqd)
  4495. +{
  4496. + u64 timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]);
  4497. +
  4498. + if (bfqd->peak_rate_samples >= BFQ_PEAK_RATE_SAMPLES)
  4499. + return bfq_calc_max_budget(bfqd->peak_rate, timeout);
  4500. + else
  4501. + return bfq_default_max_budget;
  4502. +}
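/*
 * Editorial note (not part of the patch): once enough peak-rate samples have
 * been collected, the estimate above derives the maximum budget from
 * peak_rate and the sync timeout, i.e. roughly the amount of service the
 * device can deliver within one timeout at the estimated peak rate; before
 * that, the compile-time default budget is used.
 */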
  4503. +
  4504. +static ssize_t bfq_max_budget_store(struct elevator_queue *e,
  4505. + const char *page, size_t count)
  4506. +{
  4507. + struct bfq_data *bfqd = e->elevator_data;
  4508. + unsigned long uninitialized_var(__data);
  4509. + int ret = bfq_var_store(&__data, (page), count);
  4510. +
  4511. + if (__data == 0)
  4512. + bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd);
  4513. + else {
  4514. + if (__data > INT_MAX)
  4515. + __data = INT_MAX;
  4516. + bfqd->bfq_max_budget = __data;
  4517. + }
  4518. +
  4519. + bfqd->bfq_user_max_budget = __data;
  4520. +
  4521. + return ret;
  4522. +}
  4523. +
  4524. +static ssize_t bfq_timeout_sync_store(struct elevator_queue *e,
  4525. + const char *page, size_t count)
  4526. +{
  4527. + struct bfq_data *bfqd = e->elevator_data;
  4528. + unsigned long uninitialized_var(__data);
  4529. + int ret = bfq_var_store(&__data, (page), count);
  4530. +
  4531. + if (__data < 1)
  4532. + __data = 1;
  4533. + else if (__data > INT_MAX)
  4534. + __data = INT_MAX;
  4535. +
  4536. + bfqd->bfq_timeout[BLK_RW_SYNC] = msecs_to_jiffies(__data);
  4537. + if (bfqd->bfq_user_max_budget == 0)
  4538. + bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd);
  4539. +
  4540. + return ret;
  4541. +}
  4542. +
  4543. +static ssize_t bfq_low_latency_store(struct elevator_queue *e,
  4544. + const char *page, size_t count)
  4545. +{
  4546. + struct bfq_data *bfqd = e->elevator_data;
  4547. + unsigned long uninitialized_var(__data);
  4548. + int ret = bfq_var_store(&__data, (page), count);
  4549. +
  4550. + if (__data > 1)
  4551. + __data = 1;
  4552. + if (__data == 0 && bfqd->low_latency != 0)
  4553. + bfq_end_wr(bfqd);
  4554. + bfqd->low_latency = __data;
  4555. +
  4556. + return ret;
  4557. +}
  4558. +
  4559. +#define BFQ_ATTR(name) \
  4560. + __ATTR(name, S_IRUGO|S_IWUSR, bfq_##name##_show, bfq_##name##_store)
  4561. +
  4562. +static struct elv_fs_entry bfq_attrs[] = {
  4563. + BFQ_ATTR(quantum),
  4564. + BFQ_ATTR(fifo_expire_sync),
  4565. + BFQ_ATTR(fifo_expire_async),
  4566. + BFQ_ATTR(back_seek_max),
  4567. + BFQ_ATTR(back_seek_penalty),
  4568. + BFQ_ATTR(slice_idle),
  4569. + BFQ_ATTR(max_budget),
  4570. + BFQ_ATTR(max_budget_async_rq),
  4571. + BFQ_ATTR(timeout_sync),
  4572. + BFQ_ATTR(timeout_async),
  4573. + BFQ_ATTR(low_latency),
  4574. + BFQ_ATTR(wr_coeff),
  4575. + BFQ_ATTR(wr_max_time),
  4576. + BFQ_ATTR(wr_rt_max_time),
  4577. + BFQ_ATTR(wr_min_idle_time),
  4578. + BFQ_ATTR(wr_min_inter_arr_async),
  4579. + BFQ_ATTR(wr_max_softrt_rate),
  4580. + BFQ_ATTR(weights),
  4581. + __ATTR_NULL
  4582. +};
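/*
 * Editorial note (not part of the patch): each BFQ_ATTR() entry above is
 * exported by the elevator core as a sysfs file, so with BFQ selected as the
 * active scheduler (by writing "bfq" to /sys/block/<dev>/queue/scheduler)
 * these tunables typically appear as /sys/block/<dev>/queue/iosched/<name>,
 * e.g. .../iosched/low_latency or .../iosched/slice_idle.
 */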
  4583. +
  4584. +static struct elevator_type iosched_bfq = {
  4585. + .ops = {
  4586. + .elevator_merge_fn = bfq_merge,
  4587. + .elevator_merged_fn = bfq_merged_request,
  4588. + .elevator_merge_req_fn = bfq_merged_requests,
  4589. + .elevator_allow_merge_fn = bfq_allow_merge,
  4590. + .elevator_dispatch_fn = bfq_dispatch_requests,
  4591. + .elevator_add_req_fn = bfq_insert_request,
  4592. + .elevator_activate_req_fn = bfq_activate_request,
  4593. + .elevator_deactivate_req_fn = bfq_deactivate_request,
  4594. + .elevator_completed_req_fn = bfq_completed_request,
  4595. + .elevator_former_req_fn = elv_rb_former_request,
  4596. + .elevator_latter_req_fn = elv_rb_latter_request,
  4597. + .elevator_init_icq_fn = bfq_init_icq,
  4598. + .elevator_exit_icq_fn = bfq_exit_icq,
  4599. + .elevator_set_req_fn = bfq_set_request,
  4600. + .elevator_put_req_fn = bfq_put_request,
  4601. + .elevator_may_queue_fn = bfq_may_queue,
  4602. + .elevator_init_fn = bfq_init_queue,
  4603. + .elevator_exit_fn = bfq_exit_queue,
  4604. + },
  4605. + .icq_size = sizeof(struct bfq_io_cq),
  4606. + .icq_align = __alignof__(struct bfq_io_cq),
  4607. + .elevator_attrs = bfq_attrs,
  4608. + .elevator_name = "bfq",
  4609. + .elevator_owner = THIS_MODULE,
  4610. +};
  4611. +
  4612. +static int __init bfq_init(void)
  4613. +{
  4614. + /*
  4615. + * Can be 0 on HZ < 1000 setups.
  4616. + */
  4617. + if (bfq_slice_idle == 0)
  4618. + bfq_slice_idle = 1;
  4619. +
  4620. + if (bfq_timeout_async == 0)
  4621. + bfq_timeout_async = 1;
  4622. +
  4623. + if (bfq_slab_setup())
  4624. + return -ENOMEM;
  4625. +
  4626. + /*
  4627. + * Times to load large popular applications for the typical systems
  4628. + * installed on the reference devices (see the comments before the
  4629. + * definitions of the two arrays).
  4630. + */
  4631. + T_slow[0] = msecs_to_jiffies(2600);
  4632. + T_slow[1] = msecs_to_jiffies(1000);
  4633. + T_fast[0] = msecs_to_jiffies(5500);
  4634. + T_fast[1] = msecs_to_jiffies(2000);
  4635. +
  4636. + /*
  4637. + * Thresholds that determine the switch between speed classes (see
  4638. + * the comments before the definition of the array).
  4639. + */
  4640. + device_speed_thresh[0] = (R_fast[0] + R_slow[0]) / 2;
  4641. + device_speed_thresh[1] = (R_fast[1] + R_slow[1]) / 2;
  4642. +
  4643. + elv_register(&iosched_bfq);
  4644. + pr_info("BFQ I/O-scheduler version: v7r5");
  4645. +
  4646. + return 0;
  4647. +}
  4648. +
  4649. +static void __exit bfq_exit(void)
  4650. +{
  4651. + elv_unregister(&iosched_bfq);
  4652. + bfq_slab_kill();
  4653. +}
  4654. +
  4655. +module_init(bfq_init);
  4656. +module_exit(bfq_exit);
  4657. +
  4658. +MODULE_AUTHOR("Fabio Checconi, Paolo Valente");
  4659. +MODULE_LICENSE("GPL");
  4660. diff --git a/block/bfq-sched.c b/block/bfq-sched.c
  4661. new file mode 100644
  4662. index 0000000..c4831b7
  4663. --- /dev/null
  4664. +++ b/block/bfq-sched.c
  4665. @@ -0,0 +1,1207 @@
  4666. +/*
  4667. + * BFQ: Hierarchical B-WF2Q+ scheduler.
  4668. + *
  4669. + * Based on ideas and code from CFQ:
  4670. + * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
  4671. + *
  4672. + * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
  4673. + * Paolo Valente <paolo.valente@unimore.it>
  4674. + *
  4675. + * Copyright (C) 2010 Paolo Valente <paolo.valente@unimore.it>
  4676. + */
  4677. +
  4678. +#ifdef CONFIG_CGROUP_BFQIO
  4679. +#define for_each_entity(entity) \
  4680. + for (; entity != NULL; entity = entity->parent)
  4681. +
  4682. +#define for_each_entity_safe(entity, parent) \
  4683. + for (; entity && ({ parent = entity->parent; 1; }); entity = parent)
  4684. +
  4685. +static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd,
  4686. + int extract,
  4687. + struct bfq_data *bfqd);
  4688. +
  4689. +static inline void bfq_update_budget(struct bfq_entity *next_in_service)
  4690. +{
  4691. + struct bfq_entity *bfqg_entity;
  4692. + struct bfq_group *bfqg;
  4693. + struct bfq_sched_data *group_sd;
  4694. +
  4695. + BUG_ON(next_in_service == NULL);
  4696. +
  4697. + group_sd = next_in_service->sched_data;
  4698. +
  4699. + bfqg = container_of(group_sd, struct bfq_group, sched_data);
  4700. + /*
  4701. + * bfq_group's my_entity field is not NULL only if the group
  4702. + * is not the root group. We must not touch the root entity
  4703. + * as it must never become an in-service entity.
  4704. + */
  4705. + bfqg_entity = bfqg->my_entity;
  4706. + if (bfqg_entity != NULL)
  4707. + bfqg_entity->budget = next_in_service->budget;
  4708. +}
  4709. +
  4710. +static int bfq_update_next_in_service(struct bfq_sched_data *sd)
  4711. +{
  4712. + struct bfq_entity *next_in_service;
  4713. +
  4714. + if (sd->in_service_entity != NULL)
  4715. + /* will update/requeue at the end of service */
  4716. + return 0;
  4717. +
  4718. + /*
  4719. + * NOTE: this can be improved in many ways, such as returning
  4720. + * 1 (and thus propagating upwards the update) only when the
  4721. + * budget changes, or caching the bfqq that will be scheduled
  4722. + * next from this subtree. By now we worry more about
  4723. + * correctness than about performance...
  4724. + */
  4725. + next_in_service = bfq_lookup_next_entity(sd, 0, NULL);
  4726. + sd->next_in_service = next_in_service;
  4727. +
  4728. + if (next_in_service != NULL)
  4729. + bfq_update_budget(next_in_service);
  4730. +
  4731. + return 1;
  4732. +}
  4733. +
  4734. +static inline void bfq_check_next_in_service(struct bfq_sched_data *sd,
  4735. + struct bfq_entity *entity)
  4736. +{
  4737. + BUG_ON(sd->next_in_service != entity);
  4738. +}
  4739. +#else
  4740. +#define for_each_entity(entity) \
  4741. + for (; entity != NULL; entity = NULL)
  4742. +
  4743. +#define for_each_entity_safe(entity, parent) \
  4744. + for (parent = NULL; entity != NULL; entity = parent)
  4745. +
  4746. +static inline int bfq_update_next_in_service(struct bfq_sched_data *sd)
  4747. +{
  4748. + return 0;
  4749. +}
  4750. +
  4751. +static inline void bfq_check_next_in_service(struct bfq_sched_data *sd,
  4752. + struct bfq_entity *entity)
  4753. +{
  4754. +}
  4755. +
  4756. +static inline void bfq_update_budget(struct bfq_entity *next_in_service)
  4757. +{
  4758. +}
  4759. +#endif
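/*
 * Editorial note (not part of the patch): with CONFIG_CGROUP_BFQIO the
 * for_each_entity*() macros above walk from a queue's entity up through its
 * ancestor group entities; without cgroup support they degenerate to a
 * single iteration, so the hierarchical code below also covers the flat
 * (no-cgroup) case, and the group-related helpers become empty stubs.
 */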
  4760. +
  4761. +/*
  4762. + * Shift for timestamp calculations. This actually limits the maximum
  4763. + * service allowed in one timestamp delta (small shift values increase it),
  4764. + * the maximum total weight that can be used for the queues in the system
  4765. + * (big shift values increase it), and the period of virtual time
  4766. + * wraparounds.
  4767. + */
  4768. +#define WFQ_SERVICE_SHIFT 22
  4769. +
  4770. +/**
  4771. + * bfq_gt - compare two timestamps.
  4772. + * @a: first ts.
  4773. + * @b: second ts.
  4774. + *
  4775. + * Return @a > @b, dealing with wrapping correctly.
  4776. + */
  4777. +static inline int bfq_gt(u64 a, u64 b)
  4778. +{
  4779. + return (s64)(a - b) > 0;
  4780. +}
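/*
 * Editorial sketch (not part of the patch): bfq_gt() compares timestamps the
 * same way time_after() does, relying on two's-complement wraparound of the
 * unsigned difference. A userspace check:
 */
#include <stdio.h>
#include <stdint.h>

static int gt(uint64_t a, uint64_t b)
{
	return (int64_t)(a - b) > 0;
}

int main(void)
{
	uint64_t b = UINT64_MAX - 5;	/* just before the wrap point */
	uint64_t a = b + 10;		/* wraps around to a small value */

	printf("%d %d\n", gt(a, b), gt(b, a));	/* prints "1 0" */
	return 0;
}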
  4781. +
  4782. +static inline struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity)
  4783. +{
  4784. + struct bfq_queue *bfqq = NULL;
  4785. +
  4786. + BUG_ON(entity == NULL);
  4787. +
  4788. + if (entity->my_sched_data == NULL)
  4789. + bfqq = container_of(entity, struct bfq_queue, entity);
  4790. +
  4791. + return bfqq;
  4792. +}
  4793. +
  4794. +
  4795. +/**
  4796. + * bfq_delta - map service into the virtual time domain.
  4797. + * @service: amount of service.
  4798. + * @weight: scale factor (weight of an entity or weight sum).
  4799. + */
  4800. +static inline u64 bfq_delta(unsigned long service,
  4801. + unsigned long weight)
  4802. +{
  4803. + u64 d = (u64)service << WFQ_SERVICE_SHIFT;
  4804. +
  4805. + do_div(d, weight);
  4806. + return d;
  4807. +}
  4808. +
  4809. +/**
  4810. + * bfq_calc_finish - assign the finish time to an entity.
  4811. + * @entity: the entity to act upon.
  4812. + * @service: the service to be charged to the entity.
  4813. + */
  4814. +static inline void bfq_calc_finish(struct bfq_entity *entity,
  4815. + unsigned long service)
  4816. +{
  4817. + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
  4818. +
  4819. + BUG_ON(entity->weight == 0);
  4820. +
  4821. + entity->finish = entity->start +
  4822. + bfq_delta(service, entity->weight);
  4823. +
  4824. + if (bfqq != NULL) {
  4825. + bfq_log_bfqq(bfqq->bfqd, bfqq,
  4826. + "calc_finish: serv %lu, w %d",
  4827. + service, entity->weight);
  4828. + bfq_log_bfqq(bfqq->bfqd, bfqq,
  4829. + "calc_finish: start %llu, finish %llu, delta %llu",
  4830. + entity->start, entity->finish,
  4831. + bfq_delta(service, entity->weight));
  4832. + }
  4833. +}
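/*
 * Editorial sketch (not part of the patch): bfq_delta() and bfq_calc_finish()
 * implement the usual WF2Q+ timestamping: service is scaled by
 * 2^WFQ_SERVICE_SHIFT and divided by the weight, so an entity with twice the
 * weight advances its finish time half as fast for the same service. A
 * minimal userspace version (service and weight values below are examples):
 */
#include <stdio.h>
#include <stdint.h>

#define WFQ_SERVICE_SHIFT 22

static uint64_t delta(unsigned long service, unsigned long weight)
{
	return ((uint64_t)service << WFQ_SERVICE_SHIFT) / weight;
}

int main(void)
{
	printf("service 8 at weight 100: +%llu\n",
	       (unsigned long long)delta(8, 100));
	printf("service 8 at weight 200: +%llu\n",
	       (unsigned long long)delta(8, 200));
	return 0;
}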
  4834. +
  4835. +/**
  4836. + * bfq_entity_of - get an entity from a node.
  4837. + * @node: the node field of the entity.
  4838. + *
  4839. + * Convert a node pointer to the corresponding entity. This is used only
  4840. + * to simplify the logic of some functions and not as the generic
  4841. + * conversion mechanism because, e.g., in the tree walking functions,
  4842. + * the check for a %NULL value would be redundant.
  4843. + */
  4844. +static inline struct bfq_entity *bfq_entity_of(struct rb_node *node)
  4845. +{
  4846. + struct bfq_entity *entity = NULL;
  4847. +
  4848. + if (node != NULL)
  4849. + entity = rb_entry(node, struct bfq_entity, rb_node);
  4850. +
  4851. + return entity;
  4852. +}
  4853. +
  4854. +/**
  4855. + * bfq_extract - remove an entity from a tree.
  4856. + * @root: the tree root.
  4857. + * @entity: the entity to remove.
  4858. + */
  4859. +static inline void bfq_extract(struct rb_root *root,
  4860. + struct bfq_entity *entity)
  4861. +{
  4862. + BUG_ON(entity->tree != root);
  4863. +
  4864. + entity->tree = NULL;
  4865. + rb_erase(&entity->rb_node, root);
  4866. +}
  4867. +
  4868. +/**
  4869. + * bfq_idle_extract - extract an entity from the idle tree.
  4870. + * @st: the service tree of the owning @entity.
  4871. + * @entity: the entity being removed.
  4872. + */
  4873. +static void bfq_idle_extract(struct bfq_service_tree *st,
  4874. + struct bfq_entity *entity)
  4875. +{
  4876. + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
  4877. + struct rb_node *next;
  4878. +
  4879. + BUG_ON(entity->tree != &st->idle);
  4880. +
  4881. + if (entity == st->first_idle) {
  4882. + next = rb_next(&entity->rb_node);
  4883. + st->first_idle = bfq_entity_of(next);
  4884. + }
  4885. +
  4886. + if (entity == st->last_idle) {
  4887. + next = rb_prev(&entity->rb_node);
  4888. + st->last_idle = bfq_entity_of(next);
  4889. + }
  4890. +
  4891. + bfq_extract(&st->idle, entity);
  4892. +
  4893. + if (bfqq != NULL)
  4894. + list_del(&bfqq->bfqq_list);
  4895. +}
  4896. +
  4897. +/**
  4898. + * bfq_insert - generic tree insertion.
  4899. + * @root: tree root.
  4900. + * @entity: entity to insert.
  4901. + *
  4902. + * This is used for the idle and the active tree, since they are both
  4903. + * ordered by finish time.
  4904. + */
  4905. +static void bfq_insert(struct rb_root *root, struct bfq_entity *entity)
  4906. +{
  4907. + struct bfq_entity *entry;
  4908. + struct rb_node **node = &root->rb_node;
  4909. + struct rb_node *parent = NULL;
  4910. +
  4911. + BUG_ON(entity->tree != NULL);
  4912. +
  4913. + while (*node != NULL) {
  4914. + parent = *node;
  4915. + entry = rb_entry(parent, struct bfq_entity, rb_node);
  4916. +
  4917. + if (bfq_gt(entry->finish, entity->finish))
  4918. + node = &parent->rb_left;
  4919. + else
  4920. + node = &parent->rb_right;
  4921. + }
  4922. +
  4923. + rb_link_node(&entity->rb_node, parent, node);
  4924. + rb_insert_color(&entity->rb_node, root);
  4925. +
  4926. + entity->tree = root;
  4927. +}
  4928. +
  4929. +/**
  4930. + * bfq_update_min - update the min_start field of an entity.
  4931. + * @entity: the entity to update.
  4932. + * @node: one of its children.
  4933. + *
  4934. + * This function is called when @entity may store an invalid value for
  4935. + * min_start due to updates to the active tree. The function assumes
  4936. + * that the subtree rooted at @node (which may be its left or its right
  4937. + * child) has a valid min_start value.
  4938. + */
  4939. +static inline void bfq_update_min(struct bfq_entity *entity,
  4940. + struct rb_node *node)
  4941. +{
  4942. + struct bfq_entity *child;
  4943. +
  4944. + if (node != NULL) {
  4945. + child = rb_entry(node, struct bfq_entity, rb_node);
  4946. + if (bfq_gt(entity->min_start, child->min_start))
  4947. + entity->min_start = child->min_start;
  4948. + }
  4949. +}
  4950. +
  4951. +/**
  4952. + * bfq_update_active_node - recalculate min_start.
  4953. + * @node: the node to update.
  4954. + *
  4955. + * @node may have changed position or one of its children may have moved,
  4956. + * this function updates its min_start value. The left and right subtrees
  4957. + * are assumed to hold a correct min_start value.
  4958. + */
  4959. +static inline void bfq_update_active_node(struct rb_node *node)
  4960. +{
  4961. + struct bfq_entity *entity = rb_entry(node, struct bfq_entity, rb_node);
  4962. +
  4963. + entity->min_start = entity->start;
  4964. + bfq_update_min(entity, node->rb_right);
  4965. + bfq_update_min(entity, node->rb_left);
  4966. +}
  4967. +
  4968. +/**
  4969. + * bfq_update_active_tree - update min_start for the whole active tree.
  4970. + * @node: the starting node.
  4971. + *
  4972. + * @node must be the deepest modified node after an update. This function
  4973. + * updates its min_start using the values held by its children, assuming
  4974. + * that they did not change, and then updates all the nodes that may have
  4975. + * changed in the path to the root. The only nodes that may have changed
  4976. + * are the ones in the path or their siblings.
  4977. + */
  4978. +static void bfq_update_active_tree(struct rb_node *node)
  4979. +{
  4980. + struct rb_node *parent;
  4981. +
  4982. +up:
  4983. + bfq_update_active_node(node);
  4984. +
  4985. + parent = rb_parent(node);
  4986. + if (parent == NULL)
  4987. + return;
  4988. +
  4989. + if (node == parent->rb_left && parent->rb_right != NULL)
  4990. + bfq_update_active_node(parent->rb_right);
  4991. + else if (parent->rb_left != NULL)
  4992. + bfq_update_active_node(parent->rb_left);
  4993. +
  4994. + node = parent;
  4995. + goto up;
  4996. +}
  4997. +
  4998. +static void bfq_weights_tree_add(struct bfq_data *bfqd,
  4999. + struct bfq_entity *entity,
  5000. + struct rb_root *root);
  5001. +
  5002. +static void bfq_weights_tree_remove(struct bfq_data *bfqd,
  5003. + struct bfq_entity *entity,
  5004. + struct rb_root *root);
  5005. +
  5006. +
  5007. +/**
  5008. + * bfq_active_insert - insert an entity in the active tree of its
  5009. + * group/device.
  5010. + * @st: the service tree of the entity.
  5011. + * @entity: the entity being inserted.
  5012. + *
  5013. + * The active tree is ordered by finish time, but an extra key is kept
  5014. + * in each node, containing the minimum value for the start times of
  5015. + * its children (and the node itself), so it's possible to search for
  5016. + * the eligible node with the lowest finish time in logarithmic time.
  5017. + */
  5018. +static void bfq_active_insert(struct bfq_service_tree *st,
  5019. + struct bfq_entity *entity)
  5020. +{
  5021. + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
  5022. + struct rb_node *node = &entity->rb_node;
  5023. +#ifdef CONFIG_CGROUP_BFQIO
  5024. + struct bfq_sched_data *sd = NULL;
  5025. + struct bfq_group *bfqg = NULL;
  5026. + struct bfq_data *bfqd = NULL;
  5027. +#endif
  5028. +
  5029. + bfq_insert(&st->active, entity);
  5030. +
  5031. + if (node->rb_left != NULL)
  5032. + node = node->rb_left;
  5033. + else if (node->rb_right != NULL)
  5034. + node = node->rb_right;
  5035. +
  5036. + bfq_update_active_tree(node);
  5037. +
  5038. +#ifdef CONFIG_CGROUP_BFQIO
  5039. + sd = entity->sched_data;
  5040. + bfqg = container_of(sd, struct bfq_group, sched_data);
  5041. + BUG_ON(!bfqg);
  5042. + bfqd = (struct bfq_data *)bfqg->bfqd;
  5043. +#endif
  5044. + if (bfqq != NULL)
  5045. + list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list);
  5046. +#ifdef CONFIG_CGROUP_BFQIO
  5047. + else { /* bfq_group */
  5048. + BUG_ON(!bfqd);
  5049. + bfq_weights_tree_add(bfqd, entity, &bfqd->group_weights_tree);
  5050. + }
  5051. + if (bfqg != bfqd->root_group) {
  5052. + BUG_ON(!bfqg);
  5053. + BUG_ON(!bfqd);
  5054. + bfqg->active_entities++;
  5055. + if (bfqg->active_entities == 2)
  5056. + bfqd->active_numerous_groups++;
  5057. + }
  5058. +#endif
  5059. +}
  5060. +
  5061. +/**
  5062. + * bfq_ioprio_to_weight - calc a weight from an ioprio.
  5063. + * @ioprio: the ioprio value to convert.
  5064. + */
  5065. +static inline unsigned short bfq_ioprio_to_weight(int ioprio)
  5066. +{
  5067. + BUG_ON(ioprio < 0 || ioprio >= IOPRIO_BE_NR);
  5068. + return IOPRIO_BE_NR - ioprio;
  5069. +}
  5070. +
  5071. +/**
  5072. + * bfq_weight_to_ioprio - calc an ioprio from a weight.
  5073. + * @weight: the weight value to convert.
  5074. + *
  5075. + * To preserve the old only-ioprio user interface as much as possible,
  5076. + * 0 is used as an escape ioprio value for weights (numerically) equal
  5077. + * to or larger than IOPRIO_BE_NR.
  5078. + */
  5079. +static inline unsigned short bfq_weight_to_ioprio(int weight)
  5080. +{
  5081. + BUG_ON(weight < BFQ_MIN_WEIGHT || weight > BFQ_MAX_WEIGHT);
  5082. + return IOPRIO_BE_NR - weight < 0 ? 0 : IOPRIO_BE_NR - weight;
  5083. +}
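/*
 * Editorial sketch (not part of the patch): with IOPRIO_BE_NR best-effort
 * levels (8 in mainline kernels), the two helpers above map ioprio 0..7 to
 * weights 8..1 and back, with ioprio 0 doubling as the escape value for any
 * weight of 8 or more. A userspace round-trip check:
 */
#include <stdio.h>

#define IOPRIO_BE_NR 8

static unsigned short ioprio_to_weight(int ioprio)
{
	return IOPRIO_BE_NR - ioprio;
}

static unsigned short weight_to_ioprio(int weight)
{
	return IOPRIO_BE_NR - weight < 0 ? 0 : IOPRIO_BE_NR - weight;
}

int main(void)
{
	for (int prio = 0; prio < IOPRIO_BE_NR; prio++)
		printf("ioprio %d -> weight %d -> ioprio %d\n", prio,
		       ioprio_to_weight(prio),
		       weight_to_ioprio(ioprio_to_weight(prio)));
	return 0;
}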
  5084. +
  5085. +static inline void bfq_get_entity(struct bfq_entity *entity)
  5086. +{
  5087. + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
  5088. +
  5089. + if (bfqq != NULL) {
  5090. + atomic_inc(&bfqq->ref);
  5091. + bfq_log_bfqq(bfqq->bfqd, bfqq, "get_entity: %p %d",
  5092. + bfqq, atomic_read(&bfqq->ref));
  5093. + }
  5094. +}
  5095. +
  5096. +/**
  5097. + * bfq_find_deepest - find the deepest node that an extraction can modify.
  5098. + * @node: the node being removed.
  5099. + *
  5100. + * Do the first step of an extraction in an rb tree, looking for the
  5101. + * node that will replace @node, and returning the deepest node that
  5102. + * the following modifications to the tree can touch. If @node is the
  5103. + * last node in the tree return %NULL.
  5104. + */
  5105. +static struct rb_node *bfq_find_deepest(struct rb_node *node)
  5106. +{
  5107. + struct rb_node *deepest;
  5108. +
  5109. + if (node->rb_right == NULL && node->rb_left == NULL)
  5110. + deepest = rb_parent(node);
  5111. + else if (node->rb_right == NULL)
  5112. + deepest = node->rb_left;
  5113. + else if (node->rb_left == NULL)
  5114. + deepest = node->rb_right;
  5115. + else {
  5116. + deepest = rb_next(node);
  5117. + if (deepest->rb_right != NULL)
  5118. + deepest = deepest->rb_right;
  5119. + else if (rb_parent(deepest) != node)
  5120. + deepest = rb_parent(deepest);
  5121. + }
  5122. +
  5123. + return deepest;
  5124. +}
  5125. +
  5126. +/**
  5127. + * bfq_active_extract - remove an entity from the active tree.
  5128. + * @st: the service_tree containing the tree.
  5129. + * @entity: the entity being removed.
  5130. + */
  5131. +static void bfq_active_extract(struct bfq_service_tree *st,
  5132. + struct bfq_entity *entity)
  5133. +{
  5134. + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
  5135. + struct rb_node *node;
  5136. +#ifdef CONFIG_CGROUP_BFQIO
  5137. + struct bfq_sched_data *sd = NULL;
  5138. + struct bfq_group *bfqg = NULL;
  5139. + struct bfq_data *bfqd = NULL;
  5140. +#endif
  5141. +
  5142. + node = bfq_find_deepest(&entity->rb_node);
  5143. + bfq_extract(&st->active, entity);
  5144. +
  5145. + if (node != NULL)
  5146. + bfq_update_active_tree(node);
  5147. +
  5148. +#ifdef CONFIG_CGROUP_BFQIO
  5149. + sd = entity->sched_data;
  5150. + bfqg = container_of(sd, struct bfq_group, sched_data);
  5151. + BUG_ON(!bfqg);
  5152. + bfqd = (struct bfq_data *)bfqg->bfqd;
  5153. +#endif
  5154. + if (bfqq != NULL)
  5155. + list_del(&bfqq->bfqq_list);
  5156. +#ifdef CONFIG_CGROUP_BFQIO
  5157. + else { /* bfq_group */
  5158. + BUG_ON(!bfqd);
  5159. + bfq_weights_tree_remove(bfqd, entity,
  5160. + &bfqd->group_weights_tree);
  5161. + }
  5162. + if (bfqg != bfqd->root_group) {
  5163. + BUG_ON(!bfqg);
  5164. + BUG_ON(!bfqd);
  5165. + BUG_ON(!bfqg->active_entities);
  5166. + bfqg->active_entities--;
  5167. + if (bfqg->active_entities == 1) {
  5168. + BUG_ON(!bfqd->active_numerous_groups);
  5169. + bfqd->active_numerous_groups--;
  5170. + }
  5171. + }
  5172. +#endif
  5173. +}
  5174. +
  5175. +/**
  5176. + * bfq_idle_insert - insert an entity into the idle tree.
  5177. + * @st: the service tree containing the tree.
  5178. + * @entity: the entity to insert.
  5179. + */
  5180. +static void bfq_idle_insert(struct bfq_service_tree *st,
  5181. + struct bfq_entity *entity)
  5182. +{
  5183. + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
  5184. + struct bfq_entity *first_idle = st->first_idle;
  5185. + struct bfq_entity *last_idle = st->last_idle;
  5186. +
  5187. + if (first_idle == NULL || bfq_gt(first_idle->finish, entity->finish))
  5188. + st->first_idle = entity;
  5189. + if (last_idle == NULL || bfq_gt(entity->finish, last_idle->finish))
  5190. + st->last_idle = entity;
  5191. +
  5192. + bfq_insert(&st->idle, entity);
  5193. +
  5194. + if (bfqq != NULL)
  5195. + list_add(&bfqq->bfqq_list, &bfqq->bfqd->idle_list);
  5196. +}
  5197. +
  5198. +/**
  5199. + * bfq_forget_entity - remove an entity from the wfq trees.
  5200. + * @st: the service tree.
  5201. + * @entity: the entity being removed.
  5202. + *
  5203. + * Update the device status and forget everything about @entity, putting
  5204. + * the device reference to it, if it is a queue. Entities belonging to
  5205. + * groups are not refcounted.
  5206. + */
  5207. +static void bfq_forget_entity(struct bfq_service_tree *st,
  5208. + struct bfq_entity *entity)
  5209. +{
  5210. + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
  5211. + struct bfq_sched_data *sd;
  5212. +
  5213. + BUG_ON(!entity->on_st);
  5214. +
  5215. + entity->on_st = 0;
  5216. + st->wsum -= entity->weight;
  5217. + if (bfqq != NULL) {
  5218. + sd = entity->sched_data;
  5219. + bfq_log_bfqq(bfqq->bfqd, bfqq, "forget_entity: %p %d",
  5220. + bfqq, atomic_read(&bfqq->ref));
  5221. + bfq_put_queue(bfqq);
  5222. + }
  5223. +}
  5224. +
  5225. +/**
  5226. + * bfq_put_idle_entity - release the idle tree ref of an entity.
  5227. + * @st: service tree for the entity.
  5228. + * @entity: the entity being released.
  5229. + */
  5230. +static void bfq_put_idle_entity(struct bfq_service_tree *st,
  5231. + struct bfq_entity *entity)
  5232. +{
  5233. + bfq_idle_extract(st, entity);
  5234. + bfq_forget_entity(st, entity);
  5235. +}
  5236. +
  5237. +/**
  5238. + * bfq_forget_idle - update the idle tree if necessary.
  5239. + * @st: the service tree to act upon.
  5240. + *
  5241. + * To preserve the global O(log N) complexity we only remove one entry here;
  5242. + * as the idle tree will not grow indefinitely this can be done safely.
  5243. + */
  5244. +static void bfq_forget_idle(struct bfq_service_tree *st)
  5245. +{
  5246. + struct bfq_entity *first_idle = st->first_idle;
  5247. + struct bfq_entity *last_idle = st->last_idle;
  5248. +
  5249. + if (RB_EMPTY_ROOT(&st->active) && last_idle != NULL &&
  5250. + !bfq_gt(last_idle->finish, st->vtime)) {
  5251. + /*
  5252. + * Forget the whole idle tree, increasing the vtime past
  5253. + * the last finish time of idle entities.
  5254. + */
  5255. + st->vtime = last_idle->finish;
  5256. + }
  5257. +
  5258. + if (first_idle != NULL && !bfq_gt(first_idle->finish, st->vtime))
  5259. + bfq_put_idle_entity(st, first_idle);
  5260. +}
  5261. +
  5262. +static struct bfq_service_tree *
  5263. +__bfq_entity_update_weight_prio(struct bfq_service_tree *old_st,
  5264. + struct bfq_entity *entity)
  5265. +{
  5266. + struct bfq_service_tree *new_st = old_st;
  5267. +
  5268. + if (entity->ioprio_changed) {
  5269. + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
  5270. + unsigned short prev_weight, new_weight;
  5271. + struct bfq_data *bfqd = NULL;
  5272. + struct rb_root *root;
  5273. +#ifdef CONFIG_CGROUP_BFQIO
  5274. + struct bfq_sched_data *sd;
  5275. + struct bfq_group *bfqg;
  5276. +#endif
  5277. +
  5278. + if (bfqq != NULL)
  5279. + bfqd = bfqq->bfqd;
  5280. +#ifdef CONFIG_CGROUP_BFQIO
  5281. + else {
  5282. + sd = entity->my_sched_data;
  5283. + bfqg = container_of(sd, struct bfq_group, sched_data);
  5284. + BUG_ON(!bfqg);
  5285. + bfqd = (struct bfq_data *)bfqg->bfqd;
  5286. + BUG_ON(!bfqd);
  5287. + }
  5288. +#endif
  5289. +
  5290. + BUG_ON(old_st->wsum < entity->weight);
  5291. + old_st->wsum -= entity->weight;
  5292. +
  5293. + if (entity->new_weight != entity->orig_weight) {
  5294. + entity->orig_weight = entity->new_weight;
  5295. + entity->ioprio =
  5296. + bfq_weight_to_ioprio(entity->orig_weight);
  5297. + } else if (entity->new_ioprio != entity->ioprio) {
  5298. + entity->ioprio = entity->new_ioprio;
  5299. + entity->orig_weight =
  5300. + bfq_ioprio_to_weight(entity->ioprio);
  5301. + } else
  5302. + entity->new_weight = entity->orig_weight =
  5303. + bfq_ioprio_to_weight(entity->ioprio);
  5304. +
  5305. + entity->ioprio_class = entity->new_ioprio_class;
  5306. + entity->ioprio_changed = 0;
  5307. +
  5308. + /*
  5309. + * NOTE: here we may be changing the weight too early,
  5310. + * this will cause unfairness. The correct approach
  5311. + * would have required additional complexity to defer
  5312. + * weight changes to the proper time instants (i.e.,
  5313. + * when entity->finish <= old_st->vtime).
  5314. + */
  5315. + new_st = bfq_entity_service_tree(entity);
  5316. +
  5317. + prev_weight = entity->weight;
  5318. + new_weight = entity->orig_weight *
  5319. + (bfqq != NULL ? bfqq->wr_coeff : 1);
  5320. + /*
  5321. + * If the weight of the entity changes, remove the entity
  5322. + * from its old weight counter (if there is a counter
  5323. + * associated with the entity), and add it to the counter
  5324. + * associated with its new weight.
  5325. + */
  5326. + if (prev_weight != new_weight) {
  5327. + root = bfqq ? &bfqd->queue_weights_tree :
  5328. + &bfqd->group_weights_tree;
  5329. + bfq_weights_tree_remove(bfqd, entity, root);
  5330. + }
  5331. + entity->weight = new_weight;
  5332. + /*
  5333. + * Add the entity to its weights tree only if it is
  5334. + * not associated with a weight-raised queue.
  5335. + */
  5336. + if (prev_weight != new_weight &&
  5337. + (bfqq ? bfqq->wr_coeff == 1 : 1))
  5338. + /* If we get here, root has been initialized. */
  5339. + bfq_weights_tree_add(bfqd, entity, root);
  5340. +
  5341. + new_st->wsum += entity->weight;
  5342. +
  5343. + if (new_st != old_st)
  5344. + entity->start = new_st->vtime;
  5345. + }
  5346. +
  5347. + return new_st;
  5348. +}
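The branch structure above gives weight changes precedence over ioprio changes: if a new weight was requested, the ioprio is recomputed from it; otherwise, if only the ioprio changed, the weight is recomputed from the ioprio; the effective weight is finally multiplied by the queue's weight-raising coefficient. A minimal userspace sketch of that precedence follows; the toy_* names and the linear ioprio<->weight conversion are illustrative stand-ins (the real conversion macros live elsewhere in this patch), and the fallback branch is omitted.

#include <stdio.h>

/* Illustrative only: stand-ins for the real conversion macros. */
#define TOY_IOPRIO_LEVELS 8
static int toy_ioprio_to_weight(int ioprio) { return TOY_IOPRIO_LEVELS - ioprio; }
static int toy_weight_to_ioprio(int weight) { return TOY_IOPRIO_LEVELS - weight; }

struct toy_entity {
	int weight, new_weight, orig_weight;
	int ioprio, new_ioprio;
	int wr_coeff;	/* weight-raising coefficient, 1 if not raised */
};

/* Mirrors the precedence used by __bfq_entity_update_weight_prio(). */
static void toy_update(struct toy_entity *e)
{
	if (e->new_weight != e->orig_weight) {		/* explicit weight change wins */
		e->orig_weight = e->new_weight;
		e->ioprio = toy_weight_to_ioprio(e->orig_weight);
	} else if (e->new_ioprio != e->ioprio) {	/* otherwise honour the new ioprio */
		e->ioprio = e->new_ioprio;
		e->orig_weight = toy_ioprio_to_weight(e->ioprio);
	}
	/* The effective weight also carries the weight-raising coefficient. */
	e->weight = e->orig_weight * e->wr_coeff;
}

int main(void)
{
	struct toy_entity e = { .weight = 4, .new_weight = 6, .orig_weight = 4,
				.ioprio = 4, .new_ioprio = 2, .wr_coeff = 1 };
	toy_update(&e);
	/* The explicit weight change wins over the ioprio change. */
	printf("weight=%d ioprio=%d\n", e.weight, e.ioprio);
	return 0;
}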
  5349. +
  5350. +/**
  5351. + * bfq_bfqq_served - update the scheduler status after selection for
  5352. + * service.
  5353. + * @bfqq: the queue being served.
  5354. + * @served: bytes to transfer.
  5355. + *
  5356. + * NOTE: this can be optimized, as the timestamps of upper level entities
  5357. + * are synchronized every time a new bfqq is selected for service. For now,
  5358. + * we keep the full update to better check consistency.
  5359. + */
  5360. +static void bfq_bfqq_served(struct bfq_queue *bfqq, unsigned long served)
  5361. +{
  5362. + struct bfq_entity *entity = &bfqq->entity;
  5363. + struct bfq_service_tree *st;
  5364. +
  5365. + for_each_entity(entity) {
  5366. + st = bfq_entity_service_tree(entity);
  5367. +
  5368. + entity->service += served;
  5369. + BUG_ON(entity->service > entity->budget);
  5370. + BUG_ON(st->wsum == 0);
  5371. +
  5372. + st->vtime += bfq_delta(served, st->wsum);
  5373. + bfq_forget_idle(st);
  5374. + }
  5375. + bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %lu secs", served);
  5376. +}
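The key update above is st->vtime += bfq_delta(served, st->wsum): virtual time advances by the service granted scaled by the inverse of the aggregate weight, so the more weight competing on a tree, the more slowly its vtime ages per sector served. A small sketch under the assumption that bfq_delta() behaves like a fixed-point served/wsum (the real helper is defined earlier in the patch; SERVICE_SHIFT here is just an illustrative constant).

#include <stdio.h>
#include <stdint.h>

#define SERVICE_SHIFT 22	/* illustrative fixed-point shift, not the patch's value */

/* Assumed behaviour of bfq_delta(): service scaled by 1/wsum in fixed point. */
static uint64_t toy_bfq_delta(unsigned long served, unsigned long wsum)
{
	return ((uint64_t)served << SERVICE_SHIFT) / wsum;
}

int main(void)
{
	uint64_t vtime = 0;

	/* With wsum = 10, serving 8 sectors advances vtime by 8/10 in fixed point. */
	vtime += toy_bfq_delta(8, 10);
	printf("vtime after 8 sectors, wsum=10: %llu\n", (unsigned long long)vtime);

	/* Twice the aggregate weight: the same service ages the tree half as much. */
	vtime = 0;
	vtime += toy_bfq_delta(8, 20);
	printf("vtime after 8 sectors, wsum=20: %llu\n", (unsigned long long)vtime);
	return 0;
}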
  5377. +
  5378. +/**
  5379. + * bfq_bfqq_charge_full_budget - set the service to the entity budget.
  5380. + * @bfqq: the queue that needs a service update.
  5381. + *
  5382. + * When it is not possible to be fair in the service domain, because
  5383. + * a queue is not consuming its budget fast enough (what counts as
  5384. + * fast enough depends on the timeout parameter), we charge it a full
  5385. + * budget. In this way we should obtain a sort of time-domain
  5386. + * fairness among all the seeky/slow queues.
  5387. + */
  5388. +static inline void bfq_bfqq_charge_full_budget(struct bfq_queue *bfqq)
  5389. +{
  5390. + struct bfq_entity *entity = &bfqq->entity;
  5391. +
  5392. + bfq_log_bfqq(bfqq->bfqd, bfqq, "charge_full_budget");
  5393. +
  5394. + bfq_bfqq_served(bfqq, entity->budget - entity->service);
  5395. +}
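A small numeric sketch of the difference between normal charging and the full-budget charge applied on a budget timeout (the figures are arbitrary).

#include <stdio.h>

int main(void)
{
	unsigned long budget = 1024;	/* sectors allotted for the round */
	unsigned long service = 100;	/* sectors actually transferred */

	/* Normal accounting: the entity pays only for what it received. */
	printf("charged normally:   %lu sectors\n", service);

	/* Budget timeout: charge the residual budget too, as
	 * bfq_bfqq_charge_full_budget() does via bfq_bfqq_served(). */
	printf("charged on timeout: %lu sectors\n", service + (budget - service));
	return 0;
}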
  5396. +
  5397. +/**
  5398. + * __bfq_activate_entity - activate an entity.
  5399. + * @entity: the entity being activated.
  5400. + *
  5401. + * Called whenever an entity is activated, i.e., it is not active and one
  5402. + * of its children receives a new request or has to be reactivated due to
  5403. + * budget exhaustion. The function uses the current budget of the entity
  5404. + * (and the service it has already received, if @entity is being requeued)
  5405. + * to compute its new timestamps.
  5406. + */
  5407. +static void __bfq_activate_entity(struct bfq_entity *entity)
  5408. +{
  5409. + struct bfq_sched_data *sd = entity->sched_data;
  5410. + struct bfq_service_tree *st = bfq_entity_service_tree(entity);
  5411. +
  5412. + if (entity == sd->in_service_entity) {
  5413. + BUG_ON(entity->tree != NULL);
  5414. + /*
  5415. + * If we are requeueing the current entity we have
  5416. + * to take care of not charging to it service it has
  5417. + * not received.
  5418. + */
  5419. + bfq_calc_finish(entity, entity->service);
  5420. + entity->start = entity->finish;
  5421. + sd->in_service_entity = NULL;
  5422. + } else if (entity->tree == &st->active) {
  5423. + /*
  5424. + * Requeueing an entity due to a change of some
  5425. + * next_in_service entity below it. We reuse the
  5426. + * old start time.
  5427. + */
  5428. + bfq_active_extract(st, entity);
  5429. + } else if (entity->tree == &st->idle) {
  5430. + /*
  5431. + * Must be on the idle tree, bfq_idle_extract() will
  5432. + * check for that.
  5433. + */
  5434. + bfq_idle_extract(st, entity);
  5435. + entity->start = bfq_gt(st->vtime, entity->finish) ?
  5436. + st->vtime : entity->finish;
  5437. + } else {
  5438. + /*
  5439. + * The finish time of the entity may be invalid, and
  5440. + * it is in the past for sure, otherwise the queue
  5441. + * would have been on the idle tree.
  5442. + */
  5443. + entity->start = st->vtime;
  5444. + st->wsum += entity->weight;
  5445. + bfq_get_entity(entity);
  5446. +
  5447. + BUG_ON(entity->on_st);
  5448. + entity->on_st = 1;
  5449. + }
  5450. +
  5451. + st = __bfq_entity_update_weight_prio(st, entity);
  5452. + bfq_calc_finish(entity, entity->budget);
  5453. + bfq_active_insert(st, entity);
  5454. +}
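Every branch above ends with bfq_calc_finish(entity, entity->budget), which in B-WF2Q+ is expected to assign F = S + budget/weight (in the same fixed-point scale as the vtime), so doubling the weight halves the interval within which the entity should receive its budget. A hedged worked example of that rule; toy_calc_finish() and SERVICE_SHIFT are illustrative stand-ins, not the patch's own helper.

#include <stdio.h>
#include <stdint.h>

#define SERVICE_SHIFT 22	/* illustrative fixed-point shift */

/* Assumed B-WF2Q+ rule: finish = start + budget/weight (fixed point). */
static uint64_t toy_calc_finish(uint64_t start, unsigned long budget,
				unsigned long weight)
{
	return start + (((uint64_t)budget << SERVICE_SHIFT) / weight);
}

int main(void)
{
	uint64_t start = 0;
	unsigned long budget = 512;

	printf("weight 1: finish = %llu\n",
	       (unsigned long long)toy_calc_finish(start, budget, 1));
	/* Four times the weight gives a finish time four times closer to start. */
	printf("weight 4: finish = %llu\n",
	       (unsigned long long)toy_calc_finish(start, budget, 4));
	return 0;
}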
  5455. +
  5456. +/**
  5457. + * bfq_activate_entity - activate an entity and its ancestors if necessary.
  5458. + * @entity: the entity to activate.
  5459. + *
  5460. + * Activate @entity and all the entities on the path from it to the root.
  5461. + */
  5462. +static void bfq_activate_entity(struct bfq_entity *entity)
  5463. +{
  5464. + struct bfq_sched_data *sd;
  5465. +
  5466. + for_each_entity(entity) {
  5467. + __bfq_activate_entity(entity);
  5468. +
  5469. + sd = entity->sched_data;
  5470. + if (!bfq_update_next_in_service(sd))
  5471. + /*
  5472. + * No need to propagate the activation to the
  5473. + * upper entities, as they will be updated when
  5474. + * the in-service entity is rescheduled.
  5475. + */
  5476. + break;
  5477. + }
  5478. +}
  5479. +
  5480. +/**
  5481. + * __bfq_deactivate_entity - deactivate an entity from its service tree.
  5482. + * @entity: the entity to deactivate.
  5483. + * @requeue: if false, the entity will not be put into the idle tree.
  5484. + *
  5485. + * Deactivate an entity, independently of its previous state. If the
  5486. + * entity was not on a service tree, just return; otherwise extract it
  5487. + * from whatever scheduler tree it is on, then either put it on the idle
  5488. + * tree (if @requeue is set and its finish time is in the future) or forget it.
  5489. + *
  5490. + * Return %1 if the caller should update the entity hierarchy, i.e.,
  5491. + * if the entity was in service or if it was the next_in_service for
  5492. + * its sched_data; return %0 otherwise.
  5493. + */
  5494. +static int __bfq_deactivate_entity(struct bfq_entity *entity, int requeue)
  5495. +{
  5496. + struct bfq_sched_data *sd = entity->sched_data;
  5497. + struct bfq_service_tree *st = bfq_entity_service_tree(entity);
  5498. + int was_in_service = entity == sd->in_service_entity;
  5499. + int ret = 0;
  5500. +
  5501. + if (!entity->on_st)
  5502. + return 0;
  5503. +
  5504. + BUG_ON(was_in_service && entity->tree != NULL);
  5505. +
  5506. + if (was_in_service) {
  5507. + bfq_calc_finish(entity, entity->service);
  5508. + sd->in_service_entity = NULL;
  5509. + } else if (entity->tree == &st->active)
  5510. + bfq_active_extract(st, entity);
  5511. + else if (entity->tree == &st->idle)
  5512. + bfq_idle_extract(st, entity);
  5513. + else if (entity->tree != NULL)
  5514. + BUG();
  5515. +
  5516. + if (was_in_service || sd->next_in_service == entity)
  5517. + ret = bfq_update_next_in_service(sd);
  5518. +
  5519. + if (!requeue || !bfq_gt(entity->finish, st->vtime))
  5520. + bfq_forget_entity(st, entity);
  5521. + else
  5522. + bfq_idle_insert(st, entity);
  5523. +
  5524. + BUG_ON(sd->in_service_entity == entity);
  5525. + BUG_ON(sd->next_in_service == entity);
  5526. +
  5527. + return ret;
  5528. +}
  5529. +
  5530. +/**
  5531. + * bfq_deactivate_entity - deactivate an entity.
  5532. + * @entity: the entity to deactivate.
  5533. + * @requeue: true if the entity can be put on the idle tree
  5534. + */
  5535. +static void bfq_deactivate_entity(struct bfq_entity *entity, int requeue)
  5536. +{
  5537. + struct bfq_sched_data *sd;
  5538. + struct bfq_entity *parent;
  5539. +
  5540. + for_each_entity_safe(entity, parent) {
  5541. + sd = entity->sched_data;
  5542. +
  5543. + if (!__bfq_deactivate_entity(entity, requeue))
  5544. + /*
  5545. + * The parent entity is still backlogged and does
  5546. + * not need to be updated, as it is still in
  5547. + * service.
  5548. + */
  5549. + break;
  5550. +
  5551. + if (sd->next_in_service != NULL)
  5552. + /*
  5553. + * The parent entity is still backlogged and
  5554. + * the budgets on the path towards the root
  5555. + * need to be updated.
  5556. + */
  5557. + goto update;
  5558. +
  5559. + /*
  5560. + * If we get here, the parent is no longer backlogged and
  5561. + * we need to propagate the dequeue upwards.
  5562. + */
  5563. + requeue = 1;
  5564. + }
  5565. +
  5566. + return;
  5567. +
  5568. +update:
  5569. + entity = parent;
  5570. + for_each_entity(entity) {
  5571. + __bfq_activate_entity(entity);
  5572. +
  5573. + sd = entity->sched_data;
  5574. + if (!bfq_update_next_in_service(sd))
  5575. + break;
  5576. + }
  5577. +}
  5578. +
  5579. +/**
  5580. + * bfq_update_vtime - update vtime if necessary.
  5581. + * @st: the service tree to act upon.
  5582. + *
  5583. + * If necessary, update the service tree vtime so that at least one
  5584. + * entity becomes eligible, skipping to its start time. Assumes that
  5585. + * the active tree of @st is not empty.
  5586. + *
  5587. + * NOTE: this hierarchical implementation updates vtimes quite often,
  5588. + * we may end up with reactivated processes getting timestamps after a
  5589. + * vtime skip done because we needed a ->first_active entity on some
  5590. + * intermediate node.
  5591. + */
  5592. +static void bfq_update_vtime(struct bfq_service_tree *st)
  5593. +{
  5594. + struct bfq_entity *entry;
  5595. + struct rb_node *node = st->active.rb_node;
  5596. +
  5597. + entry = rb_entry(node, struct bfq_entity, rb_node);
  5598. + if (bfq_gt(entry->min_start, st->vtime)) {
  5599. + st->vtime = entry->min_start;
  5600. + bfq_forget_idle(st);
  5601. + }
  5602. +}
  5603. +
  5604. +/**
  5605. + * bfq_first_active_entity - find the eligible entity with
  5606. + * the smallest finish time
  5607. + * @st: the service tree to select from.
  5608. + *
  5609. + * This function searches for the first schedulable entity, starting from
  5610. + * the root of the tree and going left whenever the left subtree contains
  5611. + * at least one eligible (start <= vtime) entity. The path on the right is
  5612. + * followed only if a) the left subtree contains no eligible entities and
  5613. + * b) no eligible entity has been found yet.
  5614. + */
  5615. +static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st)
  5616. +{
  5617. + struct bfq_entity *entry, *first = NULL;
  5618. + struct rb_node *node = st->active.rb_node;
  5619. +
  5620. + while (node != NULL) {
  5621. + entry = rb_entry(node, struct bfq_entity, rb_node);
  5622. +left:
  5623. + if (!bfq_gt(entry->start, st->vtime))
  5624. + first = entry;
  5625. +
  5626. + BUG_ON(bfq_gt(entry->min_start, st->vtime));
  5627. +
  5628. + if (node->rb_left != NULL) {
  5629. + entry = rb_entry(node->rb_left,
  5630. + struct bfq_entity, rb_node);
  5631. + if (!bfq_gt(entry->min_start, st->vtime)) {
  5632. + node = node->rb_left;
  5633. + goto left;
  5634. + }
  5635. + }
  5636. + if (first != NULL)
  5637. + break;
  5638. + node = node->rb_right;
  5639. + }
  5640. +
  5641. + BUG_ON(first == NULL && !RB_EMPTY_ROOT(&st->active));
  5642. + return first;
  5643. +}
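The walk above relies on two facts: the active tree is ordered by finish time, and each node's min_start bounds the smallest start time in its subtree, so a whole left subtree can be skipped when its min_start proves it contains no eligible entity. The following standalone sketch reproduces the same walk on a hand-built three-node tree (toy_* names, no rebalancing, plain integer timestamps instead of the wraparound-safe bfq_gt() comparison).

#include <stdio.h>
#include <stddef.h>

/* Toy node: the active tree is keyed on finish; min_start is the minimum
 * start time over the subtree rooted at the node. */
struct toy_node {
	unsigned long start, finish, min_start;
	struct toy_node *left, *right;
};

/* Same walk as bfq_first_active_entity(): prefer the left subtree (smaller
 * finish times) whenever it contains an eligible (start <= vtime) entity. */
static struct toy_node *toy_first_eligible(struct toy_node *node,
					   unsigned long vtime)
{
	struct toy_node *first = NULL;

	while (node) {
		if (node->start <= vtime)
			first = node;
		if (node->left && node->left->min_start <= vtime) {
			node = node->left;
			continue;
		}
		if (first)
			break;
		node = node->right;
	}
	return first;
}

int main(void)
{
	/* Finish order: left(10) < root(20) < right(30). */
	struct toy_node left  = { .start = 8, .finish = 10, .min_start = 8 };
	struct toy_node right = { .start = 2, .finish = 30, .min_start = 2 };
	struct toy_node root  = { .start = 5, .finish = 20, .min_start = 2,
				  .left = &left, .right = &right };
	/* The left subtree is pruned: min_start = 8 > vtime = 5. */
	struct toy_node *e = toy_first_eligible(&root, 5);

	printf("picked finish=%lu start=%lu\n", e->finish, e->start);
	return 0;
}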
  5644. +
  5645. +/**
  5646. + * __bfq_lookup_next_entity - return the first eligible entity in @st.
  5647. + * @st: the service tree.
  5648. + * @force: if true, propagate a budget update when the chosen entity differs
  5649. + * from the cached next_in_service (used to force IDLE-class service).
  5650. + * Update the vtime in @st and return the first eligible entity it contains.
  5651. + */
  5652. +static struct bfq_entity *__bfq_lookup_next_entity(struct bfq_service_tree *st,
  5653. + bool force)
  5654. +{
  5655. + struct bfq_entity *entity, *new_next_in_service = NULL;
  5656. +
  5657. + if (RB_EMPTY_ROOT(&st->active))
  5658. + return NULL;
  5659. +
  5660. + bfq_update_vtime(st);
  5661. + entity = bfq_first_active_entity(st);
  5662. + BUG_ON(bfq_gt(entity->start, st->vtime));
  5663. +
  5664. + /*
  5665. + * If the chosen entity does not match the sched_data's
  5666. + * next_in_service and we are forcibly serving the IDLE priority
  5667. + * class tree, bubble the budget update up the hierarchy.
  5668. + */
  5669. + if (unlikely(force && entity != entity->sched_data->next_in_service)) {
  5670. + new_next_in_service = entity;
  5671. + for_each_entity(new_next_in_service)
  5672. + bfq_update_budget(new_next_in_service);
  5673. + }
  5674. +
  5675. + return entity;
  5676. +}
  5677. +
  5678. +/**
  5679. + * bfq_lookup_next_entity - return the first eligible entity in @sd.
  5680. + * @sd: the sched_data.
  5681. + * @extract: if true the returned entity will be also extracted from @sd.
  5682. + *
  5683. + * NOTE: since we cache the next_in_service entity at each level of the
  5684. + * hierarchy, the complexity of the lookup could be reduced at no cost by
  5685. + * simply returning the cached next_in_service value; for now we prefer
  5686. + * to do full lookups, to check the consistency of the data
  5687. + * structures.
  5688. + */
  5689. +static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd,
  5690. + int extract,
  5691. + struct bfq_data *bfqd)
  5692. +{
  5693. + struct bfq_service_tree *st = sd->service_tree;
  5694. + struct bfq_entity *entity;
  5695. + int i = 0;
  5696. +
  5697. + BUG_ON(sd->in_service_entity != NULL);
  5698. +
  5699. + if (bfqd != NULL &&
  5700. + jiffies - bfqd->bfq_class_idle_last_service > BFQ_CL_IDLE_TIMEOUT) {
  5701. + entity = __bfq_lookup_next_entity(st + BFQ_IOPRIO_CLASSES - 1,
  5702. + true);
  5703. + if (entity != NULL) {
  5704. + i = BFQ_IOPRIO_CLASSES - 1;
  5705. + bfqd->bfq_class_idle_last_service = jiffies;
  5706. + sd->next_in_service = entity;
  5707. + }
  5708. + }
  5709. + for (; i < BFQ_IOPRIO_CLASSES; i++) {
  5710. + entity = __bfq_lookup_next_entity(st + i, false);
  5711. + if (entity != NULL) {
  5712. + if (extract) {
  5713. + bfq_check_next_in_service(sd, entity);
  5714. + bfq_active_extract(st + i, entity);
  5715. + sd->in_service_entity = entity;
  5716. + sd->next_in_service = NULL;
  5717. + }
  5718. + break;
  5719. + }
  5720. + }
  5721. +
  5722. + return entity;
  5723. +}
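The loop above scans the per-class trees in priority order (RT, then BE, then IDLE), which is what gives the RT class strict priority; the only exception is the anti-starvation check at the top, which jumps straight to the IDLE tree when that class has not been served for more than BFQ_CL_IDLE_TIMEOUT. A compact userspace sketch of that scan order, with toy_* stand-ins for the trees and the starvation test.

#include <stdio.h>
#include <stdbool.h>

#define TOY_CLASSES 3
static const char *toy_class_name[TOY_CLASSES] = { "RT", "BE", "IDLE" };

/* Toy stand-in: whether each per-class service tree has an eligible entity. */
static bool toy_tree_busy[TOY_CLASSES] = { false, true, true };

/* idle_starved models "jiffies - bfq_class_idle_last_service > BFQ_CL_IDLE_TIMEOUT". */
static const char *toy_lookup(bool idle_starved)
{
	int i = 0;

	if (idle_starved && toy_tree_busy[TOY_CLASSES - 1])
		i = TOY_CLASSES - 1;		/* jump straight to the IDLE tree */

	for (; i < TOY_CLASSES; i++)
		if (toy_tree_busy[i])
			return toy_class_name[i];
	return "none";
}

int main(void)
{
	printf("normal scan:        %s\n", toy_lookup(false)); /* BE wins over IDLE */
	printf("idle class starved: %s\n", toy_lookup(true));  /* IDLE served once */
	return 0;
}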
  5724. +
  5725. +/*
  5726. + * Get next queue for service.
  5727. + */
  5728. +static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd)
  5729. +{
  5730. + struct bfq_entity *entity = NULL;
  5731. + struct bfq_sched_data *sd;
  5732. + struct bfq_queue *bfqq;
  5733. +
  5734. + BUG_ON(bfqd->in_service_queue != NULL);
  5735. +
  5736. + if (bfqd->busy_queues == 0)
  5737. + return NULL;
  5738. +
  5739. + sd = &bfqd->root_group->sched_data;
  5740. + for (; sd != NULL; sd = entity->my_sched_data) {
  5741. + entity = bfq_lookup_next_entity(sd, 1, bfqd);
  5742. + BUG_ON(entity == NULL);
  5743. + entity->service = 0;
  5744. + }
  5745. +
  5746. + bfqq = bfq_entity_to_bfqq(entity);
  5747. + BUG_ON(bfqq == NULL);
  5748. +
  5749. + return bfqq;
  5750. +}
  5751. +
  5752. +/*
  5753. + * Forced extraction of the given queue.
  5754. + */
  5755. +static void bfq_get_next_queue_forced(struct bfq_data *bfqd,
  5756. + struct bfq_queue *bfqq)
  5757. +{
  5758. + struct bfq_entity *entity;
  5759. + struct bfq_sched_data *sd;
  5760. +
  5761. + BUG_ON(bfqd->in_service_queue != NULL);
  5762. +
  5763. + entity = &bfqq->entity;
  5764. + /*
  5765. + * Bubble up extraction/update from the leaf to the root.
  5766. + */
  5767. + for_each_entity(entity) {
  5768. + sd = entity->sched_data;
  5769. + bfq_update_budget(entity);
  5770. + bfq_update_vtime(bfq_entity_service_tree(entity));
  5771. + bfq_active_extract(bfq_entity_service_tree(entity), entity);
  5772. + sd->in_service_entity = entity;
  5773. + sd->next_in_service = NULL;
  5774. + entity->service = 0;
  5775. + }
  5776. +
  5777. + return;
  5778. +}
  5779. +
  5780. +static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd)
  5781. +{
  5782. + if (bfqd->in_service_bic != NULL) {
  5783. + put_io_context(bfqd->in_service_bic->icq.ioc);
  5784. + bfqd->in_service_bic = NULL;
  5785. + }
  5786. +
  5787. + bfqd->in_service_queue = NULL;
  5788. + del_timer(&bfqd->idle_slice_timer);
  5789. +}
  5790. +
  5791. +static void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
  5792. + int requeue)
  5793. +{
  5794. + struct bfq_entity *entity = &bfqq->entity;
  5795. +
  5796. + if (bfqq == bfqd->in_service_queue)
  5797. + __bfq_bfqd_reset_in_service(bfqd);
  5798. +
  5799. + bfq_deactivate_entity(entity, requeue);
  5800. +}
  5801. +
  5802. +static void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
  5803. +{
  5804. + struct bfq_entity *entity = &bfqq->entity;
  5805. +
  5806. + bfq_activate_entity(entity);
  5807. +}
  5808. +
  5809. +/*
  5810. + * Called when the bfqq no longer has requests pending: remove it from
  5811. + * the service tree.
  5812. + */
  5813. +static void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq,
  5814. + int requeue)
  5815. +{
  5816. + BUG_ON(!bfq_bfqq_busy(bfqq));
  5817. + BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list));
  5818. +
  5819. + bfq_log_bfqq(bfqd, bfqq, "del from busy");
  5820. +
  5821. + bfq_clear_bfqq_busy(bfqq);
  5822. +
  5823. + BUG_ON(bfqd->busy_queues == 0);
  5824. + bfqd->busy_queues--;
  5825. +
  5826. + if (!bfqq->dispatched) {
  5827. + bfq_weights_tree_remove(bfqd, &bfqq->entity,
  5828. + &bfqd->queue_weights_tree);
  5829. + if (!blk_queue_nonrot(bfqd->queue)) {
  5830. + BUG_ON(!bfqd->busy_in_flight_queues);
  5831. + bfqd->busy_in_flight_queues--;
  5832. + if (bfq_bfqq_constantly_seeky(bfqq)) {
  5833. + BUG_ON(!bfqd->
  5834. + const_seeky_busy_in_flight_queues);
  5835. + bfqd->const_seeky_busy_in_flight_queues--;
  5836. + }
  5837. + }
  5838. + }
  5839. + if (bfqq->wr_coeff > 1)
  5840. + bfqd->wr_busy_queues--;
  5841. +
  5842. + bfq_deactivate_bfqq(bfqd, bfqq, requeue);
  5843. +}
  5844. +
  5845. +/*
  5846. + * Called when an inactive queue receives a new request.
  5847. + */
  5848. +static void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq)
  5849. +{
  5850. + BUG_ON(bfq_bfqq_busy(bfqq));
  5851. + BUG_ON(bfqq == bfqd->in_service_queue);
  5852. +
  5853. + bfq_log_bfqq(bfqd, bfqq, "add to busy");
  5854. +
  5855. + bfq_activate_bfqq(bfqd, bfqq);
  5856. +
  5857. + bfq_mark_bfqq_busy(bfqq);
  5858. + bfqd->busy_queues++;
  5859. +
  5860. + if (!bfqq->dispatched) {
  5861. + if (bfqq->wr_coeff == 1)
  5862. + bfq_weights_tree_add(bfqd, &bfqq->entity,
  5863. + &bfqd->queue_weights_tree);
  5864. + if (!blk_queue_nonrot(bfqd->queue)) {
  5865. + bfqd->busy_in_flight_queues++;
  5866. + if (bfq_bfqq_constantly_seeky(bfqq))
  5867. + bfqd->const_seeky_busy_in_flight_queues++;
  5868. + }
  5869. + }
  5870. + if (bfqq->wr_coeff > 1)
  5871. + bfqd->wr_busy_queues++;
  5872. +}
  5873. diff --git a/block/bfq.h b/block/bfq.h
  5874. new file mode 100644
  5875. index 0000000..a83e69d
  5876. --- /dev/null
  5877. +++ b/block/bfq.h
  5878. @@ -0,0 +1,742 @@
  5879. +/*
  5880. + * BFQ-v7r5 for 3.16.0: data structures and common function prototypes.
  5881. + *
  5882. + * Based on ideas and code from CFQ:
  5883. + * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
  5884. + *
  5885. + * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
  5886. + * Paolo Valente <paolo.valente@unimore.it>
  5887. + *
  5888. + * Copyright (C) 2010 Paolo Valente <paolo.valente@unimore.it>
  5889. + */
  5890. +
  5891. +#ifndef _BFQ_H
  5892. +#define _BFQ_H
  5893. +
  5894. +#include <linux/blktrace_api.h>
  5895. +#include <linux/hrtimer.h>
  5896. +#include <linux/ioprio.h>
  5897. +#include <linux/rbtree.h>
  5898. +
  5899. +#define BFQ_IOPRIO_CLASSES 3
  5900. +#define BFQ_CL_IDLE_TIMEOUT (HZ/5)
  5901. +
  5902. +#define BFQ_MIN_WEIGHT 1
  5903. +#define BFQ_MAX_WEIGHT 1000
  5904. +
  5905. +#define BFQ_DEFAULT_GRP_WEIGHT 10
  5906. +#define BFQ_DEFAULT_GRP_IOPRIO 0
  5907. +#define BFQ_DEFAULT_GRP_CLASS IOPRIO_CLASS_BE
  5908. +
  5909. +struct bfq_entity;
  5910. +
  5911. +/**
  5912. + * struct bfq_service_tree - per ioprio_class service tree.
  5913. + * @active: tree for active entities (i.e., those backlogged).
  5914. + * @idle: tree for idle entities (i.e., those not backlogged, with V <= F_i).
  5915. + * @first_idle: idle entity with minimum F_i.
  5916. + * @last_idle: idle entity with maximum F_i.
  5917. + * @vtime: scheduler virtual time.
  5918. + * @wsum: scheduler weight sum; active and idle entities contribute to it.
  5919. + *
  5920. + * Each service tree represents a B-WF2Q+ scheduler on its own. Each
  5921. + * ioprio_class has its own independent scheduler, and so its own
  5922. + * bfq_service_tree. All the fields are protected by the queue lock
  5923. + * of the containing bfqd.
  5924. + */
  5925. +struct bfq_service_tree {
  5926. + struct rb_root active;
  5927. + struct rb_root idle;
  5928. +
  5929. + struct bfq_entity *first_idle;
  5930. + struct bfq_entity *last_idle;
  5931. +
  5932. + u64 vtime;
  5933. + unsigned long wsum;
  5934. +};
  5935. +
  5936. +/**
  5937. + * struct bfq_sched_data - multi-class scheduler.
  5938. + * @in_service_entity: entity in service.
  5939. + * @next_in_service: head-of-the-line entity in the scheduler.
  5940. + * @service_tree: array of service trees, one per ioprio_class.
  5941. + *
  5942. + * bfq_sched_data is the basic scheduler queue. It supports three
  5943. + * ioprio_classes, and can be used either as a toplevel queue or as
  5944. + * an intermediate queue on a hierarchical setup.
  5945. + * @next_in_service points to the active entity of the sched_data
  5946. + * service trees that will be scheduled next.
  5947. + *
  5948. + * The supported ioprio_classes are the same as in CFQ, in descending
  5949. + * priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE.
  5950. + * Requests from higher priority queues are served before all the
  5951. + * requests from lower priority queues; among requests of the same
  5952. + * queue requests are served according to B-WF2Q+.
  5953. + * All the fields are protected by the queue lock of the containing bfqd.
  5954. + */
  5955. +struct bfq_sched_data {
  5956. + struct bfq_entity *in_service_entity;
  5957. + struct bfq_entity *next_in_service;
  5958. + struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES];
  5959. +};
  5960. +
  5961. +/**
  5962. + * struct bfq_weight_counter - counter of the number of all active entities
  5963. + * with a given weight.
  5964. + * @weight: weight of the entities that this counter refers to.
  5965. + * @num_active: number of active entities with this weight.
  5966. + * @weights_node: weights tree member (see bfq_data's @queue_weights_tree
  5967. + * and @group_weights_tree).
  5968. + */
  5969. +struct bfq_weight_counter {
  5970. + short int weight;
  5971. + unsigned int num_active;
  5972. + struct rb_node weights_node;
  5973. +};
  5974. +
  5975. +/**
  5976. + * struct bfq_entity - schedulable entity.
  5977. + * @rb_node: service_tree member.
  5978. + * @weight_counter: pointer to the weight counter associated with this entity.
  5979. + * @on_st: flag, true if the entity is on a tree (either the active or
  5980. + * the idle one of its service_tree).
  5981. + * @finish: B-WF2Q+ finish timestamp (aka F_i).
  5982. + * @start: B-WF2Q+ start timestamp (aka S_i).
  5983. + * @tree: tree the entity is enqueued into; %NULL if not on a tree.
  5984. + * @min_start: minimum start time of the (active) subtree rooted at
  5985. + * this entity; used for O(log N) lookups into active trees.
  5986. + * @service: service received during the last round of service.
  5987. + * @budget: budget used to calculate F_i; F_i = S_i + @budget / @weight.
  5988. + * @weight: weight of the entity (for a queue, possibly boosted by weight raising).
  5989. + * @parent: parent entity, for hierarchical scheduling.
  5990. + * @my_sched_data: for non-leaf nodes in the cgroup hierarchy, the
  5991. + * associated scheduler queue, %NULL on leaf nodes.
  5992. + * @sched_data: the scheduler queue this entity belongs to.
  5993. + * @ioprio: the ioprio in use.
  5994. + * @new_weight: when a weight change is requested, the new weight value.
  5995. + * @orig_weight: original weight, used to implement weight raising.
  5996. + * @new_ioprio: when an ioprio change is requested, the new ioprio value.
  5997. + * @ioprio_class: the ioprio_class in use.
  5998. + * @new_ioprio_class: when an ioprio_class change is requested, the new
  5999. + * ioprio_class value.
  6000. + * @ioprio_changed: flag, true when the user requested a weight, ioprio or
  6001. + * ioprio_class change.
  6002. + *
  6003. + * A bfq_entity is used to represent either a bfq_queue (leaf node in the
  6004. + * cgroup hierarchy) or a bfq_group into the upper level scheduler. Each
  6005. + * entity belongs to the sched_data of the parent group in the cgroup
  6006. + * hierarchy. Non-leaf entities have also their own sched_data, stored
  6007. + * in @my_sched_data.
  6008. + *
  6009. + * Each entity stores independently its priority values; this would
  6010. + * allow different weights on different devices, but this
  6011. + * functionality is not yet exported to userspace.
  6012. + * weights are updated lazily, first storing the new values into the
  6013. + * new_* fields, then setting the @ioprio_changed flag. As soon as
  6014. + * there is a transition in the entity state that allows the priority
  6015. + * update to take place the effective and the requested priority
  6016. + * values are synchronized.
  6017. + *
  6018. + * Unless cgroups are used, the weight value is calculated from the
  6019. + * ioprio to export the same interface as CFQ. When dealing with
  6020. + * ``well-behaved'' queues (i.e., queues that do not take too long to
  6021. + * consume their budget and have truly sequential behavior), and
  6022. + * when there are no external factors breaking anticipation, the
  6023. + * relative weights at each level of the cgroups hierarchy should be
  6024. + * guaranteed. All the fields are protected by the queue lock of the
  6025. + * containing bfqd.
  6026. + */
  6027. +struct bfq_entity {
  6028. + struct rb_node rb_node;
  6029. + struct bfq_weight_counter *weight_counter;
  6030. +
  6031. + int on_st;
  6032. +
  6033. + u64 finish;
  6034. + u64 start;
  6035. +
  6036. + struct rb_root *tree;
  6037. +
  6038. + u64 min_start;
  6039. +
  6040. + unsigned long service, budget;
  6041. + unsigned short weight, new_weight;
  6042. + unsigned short orig_weight;
  6043. +
  6044. + struct bfq_entity *parent;
  6045. +
  6046. + struct bfq_sched_data *my_sched_data;
  6047. + struct bfq_sched_data *sched_data;
  6048. +
  6049. + unsigned short ioprio, new_ioprio;
  6050. + unsigned short ioprio_class, new_ioprio_class;
  6051. +
  6052. + int ioprio_changed;
  6053. +};
  6054. +
  6055. +struct bfq_group;
  6056. +
  6057. +/**
  6058. + * struct bfq_queue - leaf schedulable entity.
  6059. + * @ref: reference counter.
  6060. + * @bfqd: parent bfq_data.
  6061. + * @new_bfqq: shared bfq_queue if queue is cooperating with
  6062. + * one or more other queues.
  6063. + * @pos_node: request-position tree member (see bfq_data's @rq_pos_tree).
  6064. + * @pos_root: request-position tree root (see bfq_data's @rq_pos_tree).
  6065. + * @sort_list: sorted list of pending requests.
  6066. + * @next_rq: if fifo isn't expired, next request to serve.
  6067. + * @queued: nr of requests queued in @sort_list.
  6068. + * @allocated: currently allocated requests.
  6069. + * @meta_pending: pending metadata requests.
  6070. + * @fifo: fifo list of requests in sort_list.
  6071. + * @entity: entity representing this queue in the scheduler.
  6072. + * @max_budget: maximum budget allowed from the feedback mechanism.
  6073. + * @budget_timeout: budget expiration (in jiffies).
  6074. + * @dispatched: number of requests on the dispatch list or inside driver.
  6075. + * @flags: status flags.
  6076. + * @bfqq_list: node for active/idle bfqq list inside our bfqd.
  6077. + * @seek_samples: number of seeks sampled
  6078. + * @seek_total: sum of the distances of the seeks sampled
  6079. + * @seek_mean: mean seek distance
  6080. + * @last_request_pos: position of the last request enqueued
  6081. + * @requests_within_timer: number of consecutive pairs of request completion
  6082. + * and arrival, such that the queue becomes idle
  6083. + * after the completion, but the next request arrives
  6084. + * within an idle time slice; used only if the queue's
  6085. + * IO_bound flag has been cleared.
  6086. + * @pid: pid of the process owning the queue, used for logging purposes.
  6087. + * @last_wr_start_finish: start time of the current weight-raising period if
  6088. + * the @bfq-queue is being weight-raised, otherwise
  6089. + * finish time of the last weight-raising period
  6090. + * @wr_cur_max_time: current max raising time for this queue
  6091. + * @soft_rt_next_start: minimum time instant such that, only if a new
  6092. + * request is enqueued after this time instant in an
  6093. + * idle @bfq_queue with no outstanding requests, then
  6094. + * the task associated with the queue is deemed
  6095. + * soft real-time (see the comments to the function
  6096. + * bfq_bfqq_softrt_next_start()).
  6097. + * @last_idle_bklogged: time of the last transition of the @bfq_queue from
  6098. + * idle to backlogged
  6099. + * @service_from_backlogged: cumulative service received from the @bfq_queue
  6100. + * since the last transition from idle to
  6101. + * backlogged
  6102. + *
  6103. + * A bfq_queue is a leaf request queue; it can be associated with one or more
  6104. + * io_contexts, if it is async or shared between cooperating processes. @cgroup
  6105. + * holds a reference to the cgroup, to be sure that it does not disappear while
  6106. + * a bfqq still references it (mostly to avoid races between request issuing and
  6107. + * task migration followed by cgroup destruction).
  6108. + * All the fields are protected by the queue lock of the containing bfqd.
  6109. + */
  6110. +struct bfq_queue {
  6111. + atomic_t ref;
  6112. + struct bfq_data *bfqd;
  6113. +
  6114. + /* fields for cooperating queues handling */
  6115. + struct bfq_queue *new_bfqq;
  6116. + struct rb_node pos_node;
  6117. + struct rb_root *pos_root;
  6118. +
  6119. + struct rb_root sort_list;
  6120. + struct request *next_rq;
  6121. + int queued[2];
  6122. + int allocated[2];
  6123. + int meta_pending;
  6124. + struct list_head fifo;
  6125. +
  6126. + struct bfq_entity entity;
  6127. +
  6128. + unsigned long max_budget;
  6129. + unsigned long budget_timeout;
  6130. +
  6131. + int dispatched;
  6132. +
  6133. + unsigned int flags;
  6134. +
  6135. + struct list_head bfqq_list;
  6136. +
  6137. + unsigned int seek_samples;
  6138. + u64 seek_total;
  6139. + sector_t seek_mean;
  6140. + sector_t last_request_pos;
  6141. +
  6142. + unsigned int requests_within_timer;
  6143. +
  6144. + pid_t pid;
  6145. +
  6146. + /* weight-raising fields */
  6147. + unsigned long wr_cur_max_time;
  6148. + unsigned long soft_rt_next_start;
  6149. + unsigned long last_wr_start_finish;
  6150. + unsigned int wr_coeff;
  6151. + unsigned long last_idle_bklogged;
  6152. + unsigned long service_from_backlogged;
  6153. +};
  6154. +
  6155. +/**
  6156. + * struct bfq_ttime - per process thinktime stats.
  6157. + * @ttime_total: total process thinktime
  6158. + * @ttime_samples: number of thinktime samples
  6159. + * @ttime_mean: average process thinktime
  6160. + */
  6161. +struct bfq_ttime {
  6162. + unsigned long last_end_request;
  6163. +
  6164. + unsigned long ttime_total;
  6165. + unsigned long ttime_samples;
  6166. + unsigned long ttime_mean;
  6167. +};
  6168. +
  6169. +/**
  6170. + * struct bfq_io_cq - per (request_queue, io_context) structure.
  6171. + * @icq: associated io_cq structure
  6172. + * @bfqq: array of two process queues, the sync and the async
  6173. + * @ttime: associated @bfq_ttime struct
  6174. + */
  6175. +struct bfq_io_cq {
  6176. + struct io_cq icq; /* must be the first member */
  6177. + struct bfq_queue *bfqq[2];
  6178. + struct bfq_ttime ttime;
  6179. + int ioprio;
  6180. +};
  6181. +
  6182. +enum bfq_device_speed {
  6183. + BFQ_BFQD_FAST,
  6184. + BFQ_BFQD_SLOW,
  6185. +};
  6186. +
  6187. +/**
  6188. + * struct bfq_data - per device data structure.
  6189. + * @queue: request queue for the managed device.
  6190. + * @root_group: root bfq_group for the device.
  6191. + * @rq_pos_tree: rbtree sorted by next_request position, used when
  6192. + * determining if two or more queues have interleaving
  6193. + * requests (see bfq_close_cooperator()).
  6194. + * @active_numerous_groups: number of bfq_groups containing more than one
  6195. + * active @bfq_entity.
  6196. + * @queue_weights_tree: rbtree of weight counters of @bfq_queues, sorted by
  6197. + * weight. Used to keep track of whether all @bfq_queues
  6198. + * have the same weight. The tree contains one counter
  6199. + * for each distinct weight associated to some active
  6200. + * and not weight-raised @bfq_queue (see the comments to
  6201. + * the functions bfq_weights_tree_[add|remove] for
  6202. + * further details).
  6203. + * @group_weights_tree: rbtree of non-queue @bfq_entity weight counters, sorted
  6204. + * by weight. Used to keep track of whether all
  6205. + * @bfq_groups have the same weight. The tree contains
  6206. + * one counter for each distinct weight associated to
  6207. + * some active @bfq_group (see the comments to the
  6208. + * functions bfq_weights_tree_[add|remove] for further
  6209. + * details).
  6210. + * @busy_queues: number of bfq_queues containing requests (including the
  6211. + * queue in service, even if it is idling).
  6212. + * @busy_in_flight_queues: number of @bfq_queues containing pending or
  6213. + * in-flight requests, plus the @bfq_queue in
  6214. + * service, even if idle but waiting for the
  6215. + * possible arrival of its next sync request. This
  6216. + * field is updated only if the device is rotational,
  6217. + * but used only if the device is also NCQ-capable.
  6218. + * The reason why the field is updated also for non-
  6219. + * NCQ-capable rotational devices is related to the
  6220. + * fact that the value of @hw_tag may be set also
  6221. + * later than when busy_in_flight_queues may need to
  6222. + * be incremented for the first time(s). Taking also
  6223. + * this possibility into account, to avoid unbalanced
  6224. + * increments/decrements, would imply more overhead
  6225. + * than just updating busy_in_flight_queues
  6226. + * regardless of the value of @hw_tag.
  6227. + * @const_seeky_busy_in_flight_queues: number of constantly-seeky @bfq_queues
  6228. + * (that is, seeky queues that expired
  6229. + * for budget timeout at least once)
  6230. + * containing pending or in-flight
  6231. + * requests, including the in-service
  6232. + * @bfq_queue if constantly seeky. This
  6233. + * field is updated only if the device
  6234. + * is rotational, but used only if the
  6235. + * device is also NCQ-capable (see the
  6236. + * comments to @busy_in_flight_queues).
  6237. + * @wr_busy_queues: number of weight-raised busy @bfq_queues.
  6238. + * @queued: number of queued requests.
  6239. + * @rq_in_driver: number of requests dispatched and waiting for completion.
  6240. + * @sync_flight: number of sync requests in the driver.
  6241. + * @max_rq_in_driver: max number of reqs in driver in the last
  6242. + * @hw_tag_samples completed requests.
  6243. + * @hw_tag_samples: nr of samples used to calculate hw_tag.
  6244. + * @hw_tag: flag set to one if the driver is showing a queueing behavior.
  6245. + * @budgets_assigned: number of budgets assigned.
  6246. + * @idle_slice_timer: timer set when idling for the next sequential request
  6247. + * from the queue in service.
  6248. + * @unplug_work: delayed work to restart dispatching on the request queue.
  6249. + * @in_service_queue: bfq_queue in service.
  6250. + * @in_service_bic: bfq_io_cq (bic) associated with the @in_service_queue.
  6251. + * @last_position: on-disk position of the last served request.
  6252. + * @last_budget_start: beginning of the last budget.
  6253. + * @last_idling_start: beginning of the last idle slice.
  6254. + * @peak_rate: peak transfer rate observed for a budget.
  6255. + * @peak_rate_samples: number of samples used to calculate @peak_rate.
  6256. + * @bfq_max_budget: maximum budget allotted to a bfq_queue before
  6257. + * rescheduling.
  6258. + * @group_list: list of all the bfq_groups active on the device.
  6259. + * @active_list: list of all the bfq_queues active on the device.
  6260. + * @idle_list: list of all the bfq_queues idle on the device.
  6261. + * @bfq_quantum: max number of requests dispatched per dispatch round.
  6262. + * @bfq_fifo_expire: timeout for async/sync requests; when it expires
  6263. + * requests are served in fifo order.
  6264. + * @bfq_back_penalty: weight of backward seeks wrt forward ones.
  6265. + * @bfq_back_max: maximum allowed backward seek.
  6266. + * @bfq_slice_idle: maximum idling time.
  6267. + * @bfq_user_max_budget: user-configured max budget value
  6268. + * (0 for auto-tuning).
  6269. + * @bfq_max_budget_async_rq: maximum budget (in nr of requests) allotted to
  6270. + * async queues.
  6271. + * @bfq_timeout: timeout for bfq_queues to consume their budget; used to
  6272. + * prevent seeky queues from imposing long latencies on well-
  6273. + * behaved ones (this also implies that seeky queues cannot
  6274. + * receive guarantees in the service domain; after a timeout
  6275. + * they are charged for the whole allocated budget, to try
  6276. + * to preserve a behavior reasonably fair among them, but
  6277. + * without service-domain guarantees).
  6278. + * @bfq_coop_thresh: number of queue merges after which a @bfq_queue is
  6279. + * no longer granted any weight-raising.
  6280. + * @bfq_failed_cooperations: number of consecutive failed cooperation
  6281. + * chances after which weight-raising is restored
  6282. + * to a queue subject to more than bfq_coop_thresh
  6283. + * queue merges.
  6284. + * @bfq_requests_within_timer: number of consecutive requests that must be
  6285. + * issued within the idle time slice to set
  6286. + * again idling to a queue which was marked as
  6287. + * non-I/O-bound (see the definition of the
  6288. + * IO_bound flag for further details).
  6289. + * @bfq_wr_coeff: Maximum factor by which the weight of a weight-raised
  6290. + * queue is multiplied
  6291. + * @bfq_wr_max_time: maximum duration of a weight-raising period (jiffies)
  6292. + * @bfq_wr_rt_max_time: maximum duration for soft real-time processes
  6293. + * @bfq_wr_min_idle_time: minimum idle period after which weight-raising
  6294. + * may be reactivated for a queue (in jiffies)
  6295. + * @bfq_wr_min_inter_arr_async: minimum period between request arrivals
  6296. + * after which weight-raising may be
  6297. + * reactivated for an already busy queue
  6298. + * (in jiffies)
  6299. + * @bfq_wr_max_softrt_rate: max service-rate for a soft real-time queue,
  6300. + * in sectors per second
  6301. + * @RT_prod: cached value of the product R*T used for computing the maximum
  6302. + * duration of the weight raising automatically
  6303. + * @device_speed: device-speed class for the low-latency heuristic
  6304. + * @oom_bfqq: fallback dummy bfqq for extreme OOM conditions
  6305. + *
  6306. + * All the fields are protected by the @queue lock.
  6307. + */
  6308. +struct bfq_data {
  6309. + struct request_queue *queue;
  6310. +
  6311. + struct bfq_group *root_group;
  6312. + struct rb_root rq_pos_tree;
  6313. +
  6314. +#ifdef CONFIG_CGROUP_BFQIO
  6315. + int active_numerous_groups;
  6316. +#endif
  6317. +
  6318. + struct rb_root queue_weights_tree;
  6319. + struct rb_root group_weights_tree;
  6320. +
  6321. + int busy_queues;
  6322. + int busy_in_flight_queues;
  6323. + int const_seeky_busy_in_flight_queues;
  6324. + int wr_busy_queues;
  6325. + int queued;
  6326. + int rq_in_driver;
  6327. + int sync_flight;
  6328. +
  6329. + int max_rq_in_driver;
  6330. + int hw_tag_samples;
  6331. + int hw_tag;
  6332. +
  6333. + int budgets_assigned;
  6334. +
  6335. + struct timer_list idle_slice_timer;
  6336. + struct work_struct unplug_work;
  6337. +
  6338. + struct bfq_queue *in_service_queue;
  6339. + struct bfq_io_cq *in_service_bic;
  6340. +
  6341. + sector_t last_position;
  6342. +
  6343. + ktime_t last_budget_start;
  6344. + ktime_t last_idling_start;
  6345. + int peak_rate_samples;
  6346. + u64 peak_rate;
  6347. + unsigned long bfq_max_budget;
  6348. +
  6349. + struct hlist_head group_list;
  6350. + struct list_head active_list;
  6351. + struct list_head idle_list;
  6352. +
  6353. + unsigned int bfq_quantum;
  6354. + unsigned int bfq_fifo_expire[2];
  6355. + unsigned int bfq_back_penalty;
  6356. + unsigned int bfq_back_max;
  6357. + unsigned int bfq_slice_idle;
  6358. + u64 bfq_class_idle_last_service;
  6359. +
  6360. + unsigned int bfq_user_max_budget;
  6361. + unsigned int bfq_max_budget_async_rq;
  6362. + unsigned int bfq_timeout[2];
  6363. +
  6364. + unsigned int bfq_coop_thresh;
  6365. + unsigned int bfq_failed_cooperations;
  6366. + unsigned int bfq_requests_within_timer;
  6367. +
  6368. + bool low_latency;
  6369. +
  6370. + /* parameters of the low_latency heuristics */
  6371. + unsigned int bfq_wr_coeff;
  6372. + unsigned int bfq_wr_max_time;
  6373. + unsigned int bfq_wr_rt_max_time;
  6374. + unsigned int bfq_wr_min_idle_time;
  6375. + unsigned long bfq_wr_min_inter_arr_async;
  6376. + unsigned int bfq_wr_max_softrt_rate;
  6377. + u64 RT_prod;
  6378. + enum bfq_device_speed device_speed;
  6379. +
  6380. + struct bfq_queue oom_bfqq;
  6381. +};
  6382. +
  6383. +enum bfqq_state_flags {
  6384. + BFQ_BFQQ_FLAG_busy = 0, /* has requests or is in service */
  6385. + BFQ_BFQQ_FLAG_wait_request, /* waiting for a request */
  6386. + BFQ_BFQQ_FLAG_must_alloc, /* must be allowed rq alloc */
  6387. + BFQ_BFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */
  6388. + BFQ_BFQQ_FLAG_idle_window, /* slice idling enabled */
  6389. + BFQ_BFQQ_FLAG_prio_changed, /* task priority has changed */
  6390. + BFQ_BFQQ_FLAG_sync, /* synchronous queue */
  6391. + BFQ_BFQQ_FLAG_budget_new, /* no completion with this budget */
  6392. + BFQ_BFQQ_FLAG_IO_bound, /*
  6393. + * bfqq has timed-out at least once
  6394. + * having consumed at most 2/10 of
  6395. + * its budget
  6396. + */
  6397. + BFQ_BFQQ_FLAG_constantly_seeky, /*
  6398. + * bfqq has proved to be slow and
  6399. + * seeky until budget timeout
  6400. + */
  6401. + BFQ_BFQQ_FLAG_softrt_update, /*
  6402. + * may need softrt-next-start
  6403. + * update
  6404. + */
  6405. + BFQ_BFQQ_FLAG_coop, /* bfqq is shared */
  6406. + BFQ_BFQQ_FLAG_split_coop, /* shared bfqq will be split */
  6407. +};
  6408. +
  6409. +#define BFQ_BFQQ_FNS(name) \
  6410. +static inline void bfq_mark_bfqq_##name(struct bfq_queue *bfqq) \
  6411. +{ \
  6412. + (bfqq)->flags |= (1 << BFQ_BFQQ_FLAG_##name); \
  6413. +} \
  6414. +static inline void bfq_clear_bfqq_##name(struct bfq_queue *bfqq) \
  6415. +{ \
  6416. + (bfqq)->flags &= ~(1 << BFQ_BFQQ_FLAG_##name); \
  6417. +} \
  6418. +static inline int bfq_bfqq_##name(const struct bfq_queue *bfqq) \
  6419. +{ \
  6420. + return ((bfqq)->flags & (1 << BFQ_BFQQ_FLAG_##name)) != 0; \
  6421. +}
  6422. +
  6423. +BFQ_BFQQ_FNS(busy);
  6424. +BFQ_BFQQ_FNS(wait_request);
  6425. +BFQ_BFQQ_FNS(must_alloc);
  6426. +BFQ_BFQQ_FNS(fifo_expire);
  6427. +BFQ_BFQQ_FNS(idle_window);
  6428. +BFQ_BFQQ_FNS(prio_changed);
  6429. +BFQ_BFQQ_FNS(sync);
  6430. +BFQ_BFQQ_FNS(budget_new);
  6431. +BFQ_BFQQ_FNS(IO_bound);
  6432. +BFQ_BFQQ_FNS(constantly_seeky);
  6433. +BFQ_BFQQ_FNS(coop);
  6434. +BFQ_BFQQ_FNS(split_coop);
  6435. +BFQ_BFQQ_FNS(softrt_update);
  6436. +#undef BFQ_BFQQ_FNS
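For reference, BFQ_BFQQ_FNS(busy) above is expected to expand to roughly the following mark/clear/test triple, each manipulating a single bit of bfqq->flags (written out by hand here; the other flags expand analogously):

static inline void bfq_mark_bfqq_busy(struct bfq_queue *bfqq)
{
	(bfqq)->flags |= (1 << BFQ_BFQQ_FLAG_busy);
}
static inline void bfq_clear_bfqq_busy(struct bfq_queue *bfqq)
{
	(bfqq)->flags &= ~(1 << BFQ_BFQQ_FLAG_busy);
}
static inline int bfq_bfqq_busy(const struct bfq_queue *bfqq)
{
	return ((bfqq)->flags & (1 << BFQ_BFQQ_FLAG_busy)) != 0;
}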
  6437. +
  6438. +/* Logging facilities. */
  6439. +#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \
  6440. + blk_add_trace_msg((bfqd)->queue, "bfq%d " fmt, (bfqq)->pid, ##args)
  6441. +
  6442. +#define bfq_log(bfqd, fmt, args...) \
  6443. + blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args)
  6444. +
  6445. +/* Expiration reasons. */
  6446. +enum bfqq_expiration {
  6447. + BFQ_BFQQ_TOO_IDLE = 0, /*
  6448. + * queue has been idling for
  6449. + * too long
  6450. + */
  6451. + BFQ_BFQQ_BUDGET_TIMEOUT, /* budget took too long to be used */
  6452. + BFQ_BFQQ_BUDGET_EXHAUSTED, /* budget consumed */
  6453. + BFQ_BFQQ_NO_MORE_REQUESTS, /* the queue has no more requests */
  6454. +};
  6455. +
  6456. +#ifdef CONFIG_CGROUP_BFQIO
  6457. +/**
  6458. + * struct bfq_group - per (device, cgroup) data structure.
  6459. + * @entity: schedulable entity to insert into the parent group sched_data.
  6460. + * @sched_data: own sched_data, to contain child entities (they may be
  6461. + * both bfq_queues and bfq_groups).
  6462. + * @group_node: node to be inserted into the bfqio_cgroup->group_data
  6463. + * list of the containing cgroup's bfqio_cgroup.
  6464. + * @bfqd_node: node to be inserted into the @bfqd->group_list list
  6465. + * of the groups active on the same device; used for cleanup.
  6466. + * @bfqd: the bfq_data for the device this group acts upon.
  6467. + * @async_bfqq: array of async queues for all the tasks belonging to
  6468. + * the group, one queue per ioprio value per ioprio_class,
  6469. + * except for the idle class that has only one queue.
  6470. + * @async_idle_bfqq: async queue for the idle class (ioprio is ignored).
  6471. + * @my_entity: pointer to @entity, %NULL for the toplevel group; used
  6472. + * to avoid too many special cases during group creation/
  6473. + * migration.
  6474. + * @active_entities: number of active entities belonging to the group;
  6475. + * unused for the root group. Used to know whether there
  6476. + * are groups with more than one active @bfq_entity
  6477. + * (see the comments to the function
  6478. + * bfq_bfqq_must_not_expire()).
  6479. + *
  6480. + * Each (device, cgroup) pair has its own bfq_group, i.e., for each cgroup
  6481. + * there is a set of bfq_groups, each one collecting the lower-level
  6482. + * entities belonging to the group that are acting on the same device.
  6483. + *
  6484. + * Locking works as follows:
  6485. + * o @group_node is protected by the bfqio_cgroup lock, and is accessed
  6486. + * via RCU from its readers.
  6487. + * o @bfqd is protected by the queue lock, RCU is used to access it
  6488. + * from the readers.
  6489. + * o All the other fields are protected by the @bfqd queue lock.
  6490. + */
  6491. +struct bfq_group {
  6492. + struct bfq_entity entity;
  6493. + struct bfq_sched_data sched_data;
  6494. +
  6495. + struct hlist_node group_node;
  6496. + struct hlist_node bfqd_node;
  6497. +
  6498. + void *bfqd;
  6499. +
  6500. + struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
  6501. + struct bfq_queue *async_idle_bfqq;
  6502. +
  6503. + struct bfq_entity *my_entity;
  6504. +
  6505. + int active_entities;
  6506. +};
  6507. +
  6508. +/**
  6509. + * struct bfqio_cgroup - bfq cgroup data structure.
  6510. + * @css: subsystem state for bfq in the containing cgroup.
  6511. + * @online: flag marked when the subsystem is inserted.
  6512. + * @weight: cgroup weight.
  6513. + * @ioprio: cgroup ioprio.
  6514. + * @ioprio_class: cgroup ioprio_class.
  6515. + * @lock: spinlock that protects @ioprio, @ioprio_class and @group_data.
  6516. + * @group_data: list containing the bfq_group belonging to this cgroup.
  6517. + *
  6518. + * @group_data is accessed using RCU, with @lock protecting the updates;
  6519. + * @ioprio and @ioprio_class are also protected by @lock.
  6520. + */
  6521. +struct bfqio_cgroup {
  6522. + struct cgroup_subsys_state css;
  6523. + bool online;
  6524. +
  6525. + unsigned short weight, ioprio, ioprio_class;
  6526. +
  6527. + spinlock_t lock;
  6528. + struct hlist_head group_data;
  6529. +};
  6530. +#else
  6531. +struct bfq_group {
  6532. + struct bfq_sched_data sched_data;
  6533. +
  6534. + struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
  6535. + struct bfq_queue *async_idle_bfqq;
  6536. +};
  6537. +#endif
  6538. +
  6539. +static inline struct bfq_service_tree *
  6540. +bfq_entity_service_tree(struct bfq_entity *entity)
  6541. +{
  6542. + struct bfq_sched_data *sched_data = entity->sched_data;
  6543. + unsigned int idx = entity->ioprio_class - 1;
  6544. +
  6545. + BUG_ON(idx >= BFQ_IOPRIO_CLASSES);
  6546. + BUG_ON(sched_data == NULL);
  6547. +
  6548. + return sched_data->service_tree + idx;
  6549. +}
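bfq_entity_service_tree() indexes the per-class trees with ioprio_class - 1, relying on the kernel's ioprio class numbering (IOPRIO_CLASS_RT = 1, IOPRIO_CLASS_BE = 2, IOPRIO_CLASS_IDLE = 3 in <linux/ioprio.h>), which lays the trees out in strict priority order. A tiny standalone check of that mapping; the values are repeated in a toy enum so the sketch compiles on its own.

#include <stdio.h>

/* Values from <linux/ioprio.h>, repeated here for a standalone sketch. */
enum { TOY_IOPRIO_CLASS_RT = 1, TOY_IOPRIO_CLASS_BE = 2, TOY_IOPRIO_CLASS_IDLE = 3 };
#define TOY_IOPRIO_CLASSES 3

int main(void)
{
	int classes[] = { TOY_IOPRIO_CLASS_RT, TOY_IOPRIO_CLASS_BE, TOY_IOPRIO_CLASS_IDLE };
	const char *names[] = { "RT", "BE", "IDLE" };

	for (int i = 0; i < TOY_IOPRIO_CLASSES; i++)
		printf("%-4s -> service_tree[%d]\n", names[i], classes[i] - 1);
	return 0;
}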
  6550. +
  6551. +static inline struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic,
  6552. + int is_sync)
  6553. +{
  6554. + return bic->bfqq[!!is_sync];
  6555. +}
  6556. +
  6557. +static inline void bic_set_bfqq(struct bfq_io_cq *bic,
  6558. + struct bfq_queue *bfqq, int is_sync)
  6559. +{
  6560. + bic->bfqq[!!is_sync] = bfqq;
  6561. +}
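bic_to_bfqq() and bic_set_bfqq() index the two-element bfqq[] array with !!is_sync, so any non-zero value collapses to index 1 (sync) and zero maps to index 0 (async). A one-line check of that normalization:

#include <stdio.h>

int main(void)
{
	int is_sync_values[] = { 0, 1, 42 };

	/* !! turns any non-zero value into 1, leaving 0 as 0. */
	for (int i = 0; i < 3; i++)
		printf("is_sync=%d -> bfqq[%d]\n",
		       is_sync_values[i], !!is_sync_values[i]);
	return 0;
}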
  6562. +
  6563. +static inline struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic)
  6564. +{
  6565. + return bic->icq.q->elevator->elevator_data;
  6566. +}
  6567. +
  6568. +/**
  6569. + * bfq_get_bfqd_locked - lock a bfqd found through an RCU-protected pointer.
  6570. + * @ptr: a pointer to a bfqd.
  6571. + * @flags: storage for the flags to be saved.
  6572. + *
  6573. + * This function allows bfqg->bfqd to be protected by the
  6574. + * queue lock of the bfqd it references; the pointer is dereferenced
  6575. + * under RCU, so the storage for bfqd is guaranteed to be valid as long
  6576. + * as the RCU read-side critical section does not end. After the
  6577. + * bfqd->queue->queue_lock is taken the pointer is rechecked, to be
  6578. + * sure that no other writer accessed it. If we raced with a writer,
  6579. + * the function returns NULL, with the queue unlocked, otherwise it
  6580. + * returns the dereferenced pointer, with the queue locked.
  6581. + */
  6582. +static inline struct bfq_data *bfq_get_bfqd_locked(void **ptr,
  6583. + unsigned long *flags)
  6584. +{
  6585. + struct bfq_data *bfqd;
  6586. +
  6587. + rcu_read_lock();
  6588. + bfqd = rcu_dereference(*(struct bfq_data **)ptr);
  6589. +
  6590. + if (bfqd != NULL) {
  6591. + spin_lock_irqsave(bfqd->queue->queue_lock, *flags);
  6592. + if (*ptr == bfqd)
  6593. + goto out;
  6594. + spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags);
  6595. + }
  6596. +
  6597. + bfqd = NULL;
  6598. +out:
  6599. + rcu_read_unlock();
  6600. + return bfqd;
  6601. +}
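bfq_get_bfqd_locked() is the usual "dereference under RCU, take the object's lock, recheck the pointer" idiom: if a writer swapped the pointer in the meantime, the function drops the lock and returns NULL. Below is a loose userspace analogue of just that control flow, using a mutex in place of the queue lock and a plain pointer in place of RCU; it illustrates the recheck step only, not RCU semantics.

#include <stdio.h>
#include <pthread.h>

struct toy_bfqd {
	pthread_mutex_t lock;
	int id;
};

static struct toy_bfqd toy_instance = { PTHREAD_MUTEX_INITIALIZER, 1 };
static struct toy_bfqd *shared_ptr = &toy_instance;	/* stands in for bfqg->bfqd */

/* Return the object with its lock held, or NULL if a writer raced with us. */
static struct toy_bfqd *toy_get_locked(struct toy_bfqd **ptr)
{
	struct toy_bfqd *bfqd = *ptr;		/* "rcu_dereference" stand-in */

	if (bfqd) {
		pthread_mutex_lock(&bfqd->lock);
		if (*ptr == bfqd)		/* recheck after taking the lock */
			return bfqd;
		pthread_mutex_unlock(&bfqd->lock);
	}
	return NULL;
}

int main(void)
{
	struct toy_bfqd *bfqd = toy_get_locked(&shared_ptr);

	if (bfqd) {
		printf("locked bfqd id=%d\n", bfqd->id);
		pthread_mutex_unlock(&bfqd->lock);
	}
	return 0;
}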
  6602. +
  6603. +static inline void bfq_put_bfqd_unlock(struct bfq_data *bfqd,
  6604. + unsigned long *flags)
  6605. +{
  6606. + spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags);
  6607. +}
  6608. +
  6609. +static void bfq_changed_ioprio(struct bfq_io_cq *bic);
  6610. +static void bfq_put_queue(struct bfq_queue *bfqq);
  6611. +static void bfq_dispatch_insert(struct request_queue *q, struct request *rq);
  6612. +static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
  6613. + struct bfq_group *bfqg, int is_sync,
  6614. + struct bfq_io_cq *bic, gfp_t gfp_mask);
  6615. +static void bfq_end_wr_async_queues(struct bfq_data *bfqd,
  6616. + struct bfq_group *bfqg);
  6617. +static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg);
  6618. +static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq);
  6619. +
  6620. +#endif /* _BFQ_H */
  6621. --
  6622. 2.0.3