lsm_sorted.c 180 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618161916201621162216231624162516261627162816291630163116321633163416351636163716381639164016411642164316441645164616471648164916501651165216531654165516561657165816591660166116621663166416651666166716681669167016711672167316741675167616771678167916801681168216831684168516861687168816891690169116921693169416951696169716981699170017011702170317041705170617071708170917101711171217131714171517161717171817191720172117221723172417251726172717281729173017311732173317341735173617371738173917401741174217431744174517461747174817491750175117521753175417551756175717581759176017611762176317641765176617671768176917701771177217731774177517761777177817791780178117821783178417851786178717881789179017911792179317941795179617971798179918001801180218031804180518061807180818091810181118121813181418151816181718181819182018211822182318241825182618271828182918301831183218331834183518361837183818391840184118421843184418451846184718481849185018511852185318541855185618571858185918601861186218631864186518661867186818691870187118721873187418751876187718781879188018811882188318841885188618871888188918901891189218931894189518961897189818991900190119021903190419051906190719081909191019111912191319141915191619171918191919201921192219231924192519261927192819291930193119321933193419351936193719381939194019411942194319441945194619471948194919501951195219531954195519561957195819591960196119621963196419651966196719681969197019711972197319741975197619771978197919801981198219831984198519861987198819891990199119921993199419951996199719981999200020012002200320042005200620072008200920102011201220132014201520162017201820192020202120222023202420252026202720282029203020312032203320342035203620372038203920402041204220432044204520462047204820492050205120522053205420552056205720582059206020612062206320642065206620672068206920702071207220732074207520762077207820792080208120822083208420852086208720882089209020912092209320942095209620972098209921002101210221032104210521062107210821092110211121122113211421152116211721182119212021212122212321242125212621272128212921302131213221332134213521362137213821392140214121422143214421452146214721482149215021512152215321542155215621572158215921602161216221632164216521662167216821692170217121722173217421752176217721782179218021812182218321842185218621872188218921902191219221932194219521962197219821992200220122022203220422052206220722082209221022112212221322142215221622172218221922202221222222232224222522262227222822292230223122322233223422352236223722382239224022412242224322442245224622472248224922502251225222532254225522562257225822592260226122622263226422652266226722682269227022712272227322742275227622772278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277727782779278027812782278327842785278627872788278927902791279227932794279527962797279827992800280128022803280428052806280728082809281028112812281328142815281628172818281928202821282228232824282528262827282828292830283128322833283428352836283728382839284028412842284328442845284628472848284928502851285228532854285528562857285828592860286128622863286428652866286728682869287028712872287328742875287628772878287928802881288228832884288528862887288828892890289128922893289428952896289728982899290029012902290329042905290629072908290929102911291229132914291529162917291829192920292129222923292429252926292729282929293029312932293329342935293629372938293929402941294229432944294529462947294829492950295129522953295429552956295729582959296029612962296329642965296629672968296929702971297229732974297529762977297829792980298129822983298429852986298729882989299029912992299329942995299629972998299930003001300230033004300530063007300830093010301130123013301430153016301730183019302030213022302330243025302630273028302930303031303230333034303530363037303830393040304130423043304430453046304730483049305030513052305330543055305630573058305930603061306230633064306530663067306830693070307130723073307430753076307730783079308030813082308330843085308630873088308930903091309230933094309530963097309830993100310131023103310431053106310731083109311031113112311331143115311631173118311931203121312231233124312531263127312831293130313131323133313431353136313731383139314031413142314331443145314631473148314931503151315231533154315531563157315831593160316131623163316431653166316731683169317031713172317331743175317631773178317931803181318231833184318531863187318831893190319131923193319431953196319731983199320032013202320332043205320632073208320932103211321232133214321532163217321832193220322132223223322432253226322732283229323032313232323332343235323632373238323932403241324232433244324532463247324832493250325132523253325432553256325732583259326032613262326332643265326632673268326932703271327232733274327532763277327832793280328132823283328432853286328732883289329032913292329332943295329632973298329933003301330233033304330533063307330833093310331133123313331433153316331733183319332033213322332333243325332633273328332933303331333233333334333533363337333833393340334133423343334433453346334733483349335033513352335333543355335633573358335933603361336233633364336533663367336833693370337133723373337433753376337733783379338033813382338333843385338633873388338933903391339233933394339533963397339833993400340134023403340434053406340734083409341034113412341334143415341634173418341934203421342234233424342534263427342834293430343134323433343434353436343734383439344034413442344334443445344634473448344934503451345234533454345534563457345834593460346134623463346434653466346734683469347034713472347334743475347634773478347934803481348234833484348534863487348834893490349134923493349434953496349734983499350035013502350335043505350635073508350935103511351235133514351535163517351835193520352135223523352435253526352735283529353035313532353335343535353635373538353935403541354235433544354535463547354835493550355135523553355435553556355735583559356035613562356335643565356635673568356935703571357235733574357535763577357835793580358135823583358435853586358735883589359035913592359335943595359635973598359936003601360236033604360536063607360836093610361136123613361436153616361736183619362036213622362336243625362636273628362936303631363236333634363536363637363836393640364136423643364436453646364736483649365036513652365336543655365636573658365936603661366236633664366536663667366836693670367136723673367436753676367736783679368036813682368336843685368636873688368936903691369236933694369536963697369836993700370137023703370437053706370737083709371037113712371337143715371637173718371937203721372237233724372537263727372837293730373137323733373437353736373737383739374037413742374337443745374637473748374937503751375237533754375537563757375837593760376137623763376437653766376737683769377037713772377337743775377637773778377937803781378237833784378537863787378837893790379137923793379437953796379737983799380038013802380338043805380638073808380938103811381238133814381538163817381838193820382138223823382438253826382738283829383038313832383338343835383638373838383938403841384238433844384538463847384838493850385138523853385438553856385738583859386038613862386338643865386638673868386938703871387238733874387538763877387838793880388138823883388438853886388738883889389038913892389338943895389638973898389939003901390239033904390539063907390839093910391139123913391439153916391739183919392039213922392339243925392639273928392939303931393239333934393539363937393839393940394139423943394439453946394739483949395039513952395339543955395639573958395939603961396239633964396539663967396839693970397139723973397439753976397739783979398039813982398339843985398639873988398939903991399239933994399539963997399839994000400140024003400440054006400740084009401040114012401340144015401640174018401940204021402240234024402540264027402840294030403140324033403440354036403740384039404040414042404340444045404640474048404940504051405240534054405540564057405840594060406140624063406440654066406740684069407040714072407340744075407640774078407940804081408240834084408540864087408840894090409140924093409440954096409740984099410041014102410341044105410641074108410941104111411241134114411541164117411841194120412141224123412441254126412741284129413041314132413341344135413641374138413941404141414241434144414541464147414841494150415141524153415441554156415741584159416041614162416341644165416641674168416941704171417241734174417541764177417841794180418141824183418441854186418741884189419041914192419341944195419641974198419942004201420242034204420542064207420842094210421142124213421442154216421742184219422042214222422342244225422642274228422942304231423242334234423542364237423842394240424142424243424442454246424742484249425042514252425342544255425642574258425942604261426242634264426542664267426842694270427142724273427442754276427742784279428042814282428342844285428642874288428942904291429242934294429542964297429842994300430143024303430443054306430743084309431043114312431343144315431643174318431943204321432243234324432543264327432843294330433143324333433443354336433743384339434043414342434343444345434643474348434943504351435243534354435543564357435843594360436143624363436443654366436743684369437043714372437343744375437643774378437943804381438243834384438543864387438843894390439143924393439443954396439743984399440044014402440344044405440644074408440944104411441244134414441544164417441844194420442144224423442444254426442744284429443044314432443344344435443644374438443944404441444244434444444544464447444844494450445144524453445444554456445744584459446044614462446344644465446644674468446944704471447244734474447544764477447844794480448144824483448444854486448744884489449044914492449344944495449644974498449945004501450245034504450545064507450845094510451145124513451445154516451745184519452045214522452345244525452645274528452945304531453245334534453545364537453845394540454145424543454445454546454745484549455045514552455345544555455645574558455945604561456245634564456545664567456845694570457145724573457445754576457745784579458045814582458345844585458645874588458945904591459245934594459545964597459845994600460146024603460446054606460746084609461046114612461346144615461646174618461946204621462246234624462546264627462846294630463146324633463446354636463746384639464046414642464346444645464646474648464946504651465246534654465546564657465846594660466146624663466446654666466746684669467046714672467346744675467646774678467946804681468246834684468546864687468846894690469146924693469446954696469746984699470047014702470347044705470647074708470947104711471247134714471547164717471847194720472147224723472447254726472747284729473047314732473347344735473647374738473947404741474247434744474547464747474847494750475147524753475447554756475747584759476047614762476347644765476647674768476947704771477247734774477547764777477847794780478147824783478447854786478747884789479047914792479347944795479647974798479948004801480248034804480548064807480848094810481148124813481448154816481748184819482048214822482348244825482648274828482948304831483248334834483548364837483848394840484148424843484448454846484748484849485048514852485348544855485648574858485948604861486248634864486548664867486848694870487148724873487448754876487748784879488048814882488348844885488648874888488948904891489248934894489548964897489848994900490149024903490449054906490749084909491049114912491349144915491649174918491949204921492249234924492549264927492849294930493149324933493449354936493749384939494049414942494349444945494649474948494949504951495249534954495549564957495849594960496149624963496449654966496749684969497049714972497349744975497649774978497949804981498249834984498549864987498849894990499149924993499449954996499749984999500050015002500350045005500650075008500950105011501250135014501550165017501850195020502150225023502450255026502750285029503050315032503350345035503650375038503950405041504250435044504550465047504850495050505150525053505450555056505750585059506050615062506350645065506650675068506950705071507250735074507550765077507850795080508150825083508450855086508750885089509050915092509350945095509650975098509951005101510251035104510551065107510851095110511151125113511451155116511751185119512051215122512351245125512651275128512951305131513251335134513551365137513851395140514151425143514451455146514751485149515051515152515351545155515651575158515951605161516251635164516551665167516851695170517151725173517451755176517751785179518051815182518351845185518651875188518951905191519251935194519551965197519851995200520152025203520452055206520752085209521052115212521352145215521652175218521952205221522252235224522552265227522852295230523152325233523452355236523752385239524052415242524352445245524652475248524952505251525252535254525552565257525852595260526152625263526452655266526752685269527052715272527352745275527652775278527952805281528252835284528552865287528852895290529152925293529452955296529752985299530053015302530353045305530653075308530953105311531253135314531553165317531853195320532153225323532453255326532753285329533053315332533353345335533653375338533953405341534253435344534553465347534853495350535153525353535453555356535753585359536053615362536353645365536653675368536953705371537253735374537553765377537853795380538153825383538453855386538753885389539053915392539353945395539653975398539954005401540254035404540554065407540854095410541154125413541454155416541754185419542054215422542354245425542654275428542954305431543254335434543554365437543854395440544154425443544454455446544754485449545054515452545354545455545654575458545954605461546254635464546554665467546854695470547154725473547454755476547754785479548054815482548354845485548654875488548954905491549254935494549554965497549854995500550155025503550455055506550755085509551055115512551355145515551655175518551955205521552255235524552555265527552855295530553155325533553455355536553755385539554055415542554355445545554655475548554955505551555255535554555555565557555855595560556155625563556455655566556755685569557055715572557355745575557655775578557955805581558255835584558555865587558855895590559155925593559455955596559755985599560056015602560356045605560656075608560956105611561256135614561556165617561856195620562156225623562456255626562756285629563056315632563356345635563656375638563956405641564256435644564556465647564856495650565156525653565456555656565756585659566056615662566356645665566656675668566956705671567256735674567556765677567856795680568156825683568456855686568756885689569056915692569356945695569656975698569957005701570257035704570557065707570857095710571157125713571457155716571757185719572057215722572357245725572657275728572957305731573257335734573557365737573857395740574157425743574457455746574757485749575057515752575357545755575657575758575957605761576257635764576557665767576857695770577157725773577457755776577757785779578057815782578357845785578657875788578957905791579257935794579557965797579857995800580158025803580458055806580758085809581058115812581358145815581658175818581958205821582258235824582558265827582858295830583158325833583458355836583758385839584058415842584358445845584658475848584958505851585258535854585558565857585858595860586158625863586458655866586758685869587058715872587358745875587658775878587958805881588258835884588558865887588858895890589158925893589458955896589758985899590059015902590359045905590659075908590959105911591259135914591559165917591859195920592159225923592459255926592759285929593059315932593359345935593659375938593959405941594259435944594559465947594859495950595159525953595459555956595759585959596059615962596359645965596659675968596959705971597259735974597559765977597859795980598159825983598459855986598759885989599059915992599359945995599659975998599960006001600260036004600560066007600860096010601160126013601460156016601760186019602060216022602360246025602660276028602960306031603260336034603560366037603860396040604160426043604460456046604760486049605060516052605360546055605660576058605960606061606260636064606560666067606860696070607160726073607460756076607760786079608060816082608360846085608660876088608960906091609260936094609560966097609860996100610161026103610461056106610761086109611061116112611361146115611661176118611961206121612261236124612561266127612861296130613161326133613461356136613761386139614061416142614361446145614661476148614961506151615261536154615561566157615861596160616161626163616461656166616761686169617061716172617361746175617661776178617961806181618261836184618561866187618861896190619161926193619461956196
  1. /*
  2. ** 2011-08-14
  3. **
  4. ** The author disclaims copyright to this source code. In place of
  5. ** a legal notice, here is a blessing:
  6. **
  7. ** May you do good and not evil.
  8. ** May you find forgiveness for yourself and forgive others.
  9. ** May you share freely, never taking more than you give.
  10. **
  11. *************************************************************************
  12. **
  13. ** PAGE FORMAT:
  14. **
  15. ** The maximum page size is 65536 bytes.
  16. **
  17. ** Since all records are equal to or larger than 2 bytes in size, and
  18. ** some space within the page is consumed by the page footer, there must
  19. ** be less than 2^15 records on each page.
  20. **
  21. ** Each page ends with a footer that describes the pages contents. This
  22. ** footer serves as similar purpose to the page header in an SQLite database.
  23. ** A footer is used instead of a header because it makes it easier to
  24. ** populate a new page based on a sorted list of key/value pairs.
  25. **
  26. ** The footer consists of the following values (starting at the end of
  27. ** the page and continuing backwards towards the start). All values are
  28. ** stored as unsigned big-endian integers.
  29. **
  30. ** * Number of records on page (2 bytes).
  31. ** * Flags field (2 bytes).
  32. ** * Left-hand pointer value (8 bytes).
  33. ** * The starting offset of each record (2 bytes per record).
  34. **
  35. ** Records may span pages. Unless it happens to be an exact fit, the part
  36. ** of the final record that starts on page X that does not fit on page X
  37. ** is stored at the start of page (X+1). This means there may be pages where
  38. ** (N==0). And on most pages the first record that starts on the page will
  39. ** not start at byte offset 0. For example:
  40. **
  41. ** aaaaa bbbbb ccc <footer> cc eeeee fffff g <footer> gggg....
  42. **
  43. ** RECORD FORMAT:
  44. **
  45. ** The first byte of the record is a flags byte. It is a combination
  46. ** of the following flags (defined in lsmInt.h):
  47. **
  48. ** LSM_START_DELETE
  49. ** LSM_END_DELETE
  50. ** LSM_POINT_DELETE
  51. ** LSM_INSERT
  52. ** LSM_SEPARATOR
  53. ** LSM_SYSTEMKEY
  54. **
  55. ** Immediately following the type byte is a pointer to the smallest key
  56. ** in the next file that is larger than the key in the current record. The
  57. ** pointer is encoded as a varint. When added to the 32-bit page number
  58. ** stored in the footer, it is the page number of the page that contains the
  59. ** smallest key in the next sorted file that is larger than this key.
  60. **
  61. ** Next is the number of bytes in the key, encoded as a varint.
  62. **
  63. ** If the LSM_INSERT flag is set, the number of bytes in the value, as
  64. ** a varint, is next.
  65. **
  66. ** Finally, the blob of data containing the key, and for LSM_INSERT
  67. ** records, the value as well.
  68. */
  69. #ifndef _LSM_INT_H
  70. # include "lsmInt.h"
  71. #endif
  72. #define LSM_LOG_STRUCTURE 0
  73. #define LSM_LOG_DATA 0
  74. /*
  75. ** Macros to help decode record types.
  76. */
  77. #define rtTopic(eType) ((eType) & LSM_SYSTEMKEY)
  78. #define rtIsDelete(eType) (((eType) & 0x0F)==LSM_POINT_DELETE)
  79. #define rtIsSeparator(eType) (((eType) & LSM_SEPARATOR)!=0)
  80. #define rtIsWrite(eType) (((eType) & LSM_INSERT)!=0)
  81. #define rtIsSystem(eType) (((eType) & LSM_SYSTEMKEY)!=0)
  82. /*
  83. ** The following macros are used to access a page footer.
  84. */
  85. #define SEGMENT_NRECORD_OFFSET(pgsz) ((pgsz) - 2)
  86. #define SEGMENT_FLAGS_OFFSET(pgsz) ((pgsz) - 2 - 2)
  87. #define SEGMENT_POINTER_OFFSET(pgsz) ((pgsz) - 2 - 2 - 8)
  88. #define SEGMENT_CELLPTR_OFFSET(pgsz, iCell) ((pgsz) - 2 - 2 - 8 - 2 - (iCell)*2)
  89. #define SEGMENT_EOF(pgsz, nEntry) SEGMENT_CELLPTR_OFFSET(pgsz, nEntry-1)
  90. #define SEGMENT_BTREE_FLAG 0x0001
  91. #define PGFTR_SKIP_NEXT_FLAG 0x0002
  92. #define PGFTR_SKIP_THIS_FLAG 0x0004
  93. #ifndef LSM_SEGMENTPTR_FREE_THRESHOLD
  94. # define LSM_SEGMENTPTR_FREE_THRESHOLD 1024
  95. #endif
  96. typedef struct SegmentPtr SegmentPtr;
  97. typedef struct LsmBlob LsmBlob;
  98. struct LsmBlob {
  99. lsm_env *pEnv;
  100. void *pData;
  101. int nData;
  102. int nAlloc;
  103. };
  104. /*
  105. ** A SegmentPtr object may be used for one of two purposes:
  106. **
  107. ** * To iterate and/or seek within a single Segment (the combination of a
  108. ** main run and an optional sorted run).
  109. **
  110. ** * To iterate through the separators array of a segment.
  111. */
  112. struct SegmentPtr {
  113. Level *pLevel; /* Level object segment is part of */
  114. Segment *pSeg; /* Segment to access */
  115. /* Current page. See segmentPtrLoadPage(). */
  116. Page *pPg; /* Current page */
  117. u16 flags; /* Copy of page flags field */
  118. int nCell; /* Number of cells on pPg */
  119. LsmPgno iPtr; /* Base cascade pointer */
  120. /* Current cell. See segmentPtrLoadCell() */
  121. int iCell; /* Current record within page pPg */
  122. int eType; /* Type of current record */
  123. LsmPgno iPgPtr; /* Cascade pointer offset */
  124. void *pKey; int nKey; /* Key associated with current record */
  125. void *pVal; int nVal; /* Current record value (eType==WRITE only) */
  126. /* Blobs used to allocate buffers for pKey and pVal as required */
  127. LsmBlob blob1;
  128. LsmBlob blob2;
  129. };
  130. /*
  131. ** Used to iterate through the keys stored in a b-tree hierarchy from start
  132. ** to finish. Only First() and Next() operations are required.
  133. **
  134. ** btreeCursorNew()
  135. ** btreeCursorFirst()
  136. ** btreeCursorNext()
  137. ** btreeCursorFree()
  138. ** btreeCursorPosition()
  139. ** btreeCursorRestore()
  140. */
  141. typedef struct BtreePg BtreePg;
  142. typedef struct BtreeCursor BtreeCursor;
  143. struct BtreePg {
  144. Page *pPage;
  145. int iCell;
  146. };
  147. struct BtreeCursor {
  148. Segment *pSeg; /* Iterate through this segments btree */
  149. FileSystem *pFS; /* File system to read pages from */
  150. int nDepth; /* Allocated size of aPg[] */
  151. int iPg; /* Current entry in aPg[]. -1 -> EOF. */
  152. BtreePg *aPg; /* Pages from root to current location */
  153. /* Cache of current entry. pKey==0 for EOF. */
  154. void *pKey;
  155. int nKey;
  156. int eType;
  157. LsmPgno iPtr;
  158. /* Storage for key, if not local */
  159. LsmBlob blob;
  160. };
  161. /*
  162. ** A cursor used for merged searches or iterations through up to one
  163. ** Tree structure and any number of sorted files.
  164. **
  165. ** lsmMCursorNew()
  166. ** lsmMCursorSeek()
  167. ** lsmMCursorNext()
  168. ** lsmMCursorPrev()
  169. ** lsmMCursorFirst()
  170. ** lsmMCursorLast()
  171. ** lsmMCursorKey()
  172. ** lsmMCursorValue()
  173. ** lsmMCursorValid()
  174. **
  175. ** iFree:
  176. ** This variable is only used by cursors providing input data for a
  177. ** new top-level segment. Such cursors only ever iterate forwards, not
  178. ** backwards.
  179. */
  180. struct MultiCursor {
  181. lsm_db *pDb; /* Connection that owns this cursor */
  182. MultiCursor *pNext; /* Next cursor owned by connection pDb */
  183. int flags; /* Mask of CURSOR_XXX flags */
  184. int eType; /* Cache of current key type */
  185. LsmBlob key; /* Cache of current key (or NULL) */
  186. LsmBlob val; /* Cache of current value */
  187. /* All the component cursors: */
  188. TreeCursor *apTreeCsr[2]; /* Up to two tree cursors */
  189. int iFree; /* Next element of free-list (-ve for eof) */
  190. SegmentPtr *aPtr; /* Array of segment pointers */
  191. int nPtr; /* Size of array aPtr[] */
  192. BtreeCursor *pBtCsr; /* b-tree cursor (db writes only) */
  193. /* Comparison results */
  194. int nTree; /* Size of aTree[] array */
  195. int *aTree; /* Array of comparison results */
  196. /* Used by cursors flushing the in-memory tree only */
  197. void *pSystemVal; /* Pointer to buffer to free */
  198. /* Used by worker cursors only */
  199. LsmPgno *pPrevMergePtr;
  200. };
  201. /*
  202. ** The following constants are used to assign integers to each component
  203. ** cursor of a multi-cursor.
  204. */
  205. #define CURSOR_DATA_TREE0 0 /* Current tree cursor (apTreeCsr[0]) */
  206. #define CURSOR_DATA_TREE1 1 /* The "old" tree, if any (apTreeCsr[1]) */
  207. #define CURSOR_DATA_SYSTEM 2 /* Free-list entries (new-toplevel only) */
  208. #define CURSOR_DATA_SEGMENT 3 /* First segment pointer (aPtr[0]) */
  209. /*
  210. ** CURSOR_IGNORE_DELETE
  211. ** If set, this cursor will not visit SORTED_DELETE keys.
  212. **
  213. ** CURSOR_FLUSH_FREELIST
  214. ** This cursor is being used to create a new toplevel. It should also
  215. ** iterate through the contents of the in-memory free block list.
  216. **
  217. ** CURSOR_IGNORE_SYSTEM
  218. ** If set, this cursor ignores system keys.
  219. **
  220. ** CURSOR_NEXT_OK
  221. ** Set if it is Ok to call lsm_csr_next().
  222. **
  223. ** CURSOR_PREV_OK
  224. ** Set if it is Ok to call lsm_csr_prev().
  225. **
  226. ** CURSOR_READ_SEPARATORS
  227. ** Set if this cursor should visit the separator keys in segment
  228. ** aPtr[nPtr-1].
  229. **
  230. ** CURSOR_SEEK_EQ
  231. ** Cursor has undergone a successful lsm_csr_seek(LSM_SEEK_EQ) operation.
  232. ** The key and value are stored in MultiCursor.key and MultiCursor.val
  233. ** respectively.
  234. */
  235. #define CURSOR_IGNORE_DELETE 0x00000001
  236. #define CURSOR_FLUSH_FREELIST 0x00000002
  237. #define CURSOR_IGNORE_SYSTEM 0x00000010
  238. #define CURSOR_NEXT_OK 0x00000020
  239. #define CURSOR_PREV_OK 0x00000040
  240. #define CURSOR_READ_SEPARATORS 0x00000080
  241. #define CURSOR_SEEK_EQ 0x00000100
  242. typedef struct MergeWorker MergeWorker;
  243. typedef struct Hierarchy Hierarchy;
  244. struct Hierarchy {
  245. Page **apHier;
  246. int nHier;
  247. };
  248. /*
  249. ** aSave:
  250. ** When mergeWorkerNextPage() is called to advance to the next page in
  251. ** the output segment, if the bStore flag for an element of aSave[] is
  252. ** true, it is cleared and the corresponding iPgno value is set to the
  253. ** page number of the page just completed.
  254. **
  255. ** aSave[0] is used to record the pointer value to be pushed into the
  256. ** b-tree hierarchy. aSave[1] is used to save the page number of the
  257. ** page containing the indirect key most recently written to the b-tree.
  258. ** see mergeWorkerPushHierarchy() for details.
  259. */
  260. struct MergeWorker {
  261. lsm_db *pDb; /* Database handle */
  262. Level *pLevel; /* Worker snapshot Level being merged */
  263. MultiCursor *pCsr; /* Cursor to read new segment contents from */
  264. int bFlush; /* True if this is an in-memory tree flush */
  265. Hierarchy hier; /* B-tree hierarchy under construction */
  266. Page *pPage; /* Current output page */
  267. int nWork; /* Number of calls to mergeWorkerNextPage() */
  268. LsmPgno *aGobble; /* Gobble point for each input segment */
  269. LsmPgno iIndirect;
  270. struct SavedPgno {
  271. LsmPgno iPgno;
  272. int bStore;
  273. } aSave[2];
  274. };
  275. #ifdef LSM_DEBUG_EXPENSIVE
  276. static int assertPointersOk(lsm_db *, Segment *, Segment *, int);
  277. static int assertBtreeOk(lsm_db *, Segment *);
  278. static void assertRunInOrder(lsm_db *pDb, Segment *pSeg);
  279. #else
  280. #define assertRunInOrder(x,y)
  281. #define assertBtreeOk(x,y)
  282. #endif
  283. struct FilePage { u8 *aData; int nData; };
  284. static u8 *fsPageData(Page *pPg, int *pnData){
  285. *pnData = ((struct FilePage *)(pPg))->nData;
  286. return ((struct FilePage *)(pPg))->aData;
  287. }
  288. /*UNUSED static u8 *fsPageDataPtr(Page *pPg){
  289. return ((struct FilePage *)(pPg))->aData;
  290. }*/
  291. /*
  292. ** Write nVal as a 16-bit unsigned big-endian integer into buffer aOut.
  293. */
  294. void lsmPutU16(u8 *aOut, u16 nVal){
  295. aOut[0] = (u8)((nVal>>8) & 0xFF);
  296. aOut[1] = (u8)(nVal & 0xFF);
  297. }
  298. void lsmPutU32(u8 *aOut, u32 nVal){
  299. aOut[0] = (u8)((nVal>>24) & 0xFF);
  300. aOut[1] = (u8)((nVal>>16) & 0xFF);
  301. aOut[2] = (u8)((nVal>> 8) & 0xFF);
  302. aOut[3] = (u8)((nVal ) & 0xFF);
  303. }
  304. int lsmGetU16(u8 *aOut){
  305. return (aOut[0] << 8) + aOut[1];
  306. }
  307. u32 lsmGetU32(u8 *aOut){
  308. return ((u32)aOut[0] << 24)
  309. + ((u32)aOut[1] << 16)
  310. + ((u32)aOut[2] << 8)
  311. + ((u32)aOut[3]);
  312. }
  313. u64 lsmGetU64(u8 *aOut){
  314. return ((u64)aOut[0] << 56)
  315. + ((u64)aOut[1] << 48)
  316. + ((u64)aOut[2] << 40)
  317. + ((u64)aOut[3] << 32)
  318. + ((u64)aOut[4] << 24)
  319. + ((u32)aOut[5] << 16)
  320. + ((u32)aOut[6] << 8)
  321. + ((u32)aOut[7]);
  322. }
  323. void lsmPutU64(u8 *aOut, u64 nVal){
  324. aOut[0] = (u8)((nVal>>56) & 0xFF);
  325. aOut[1] = (u8)((nVal>>48) & 0xFF);
  326. aOut[2] = (u8)((nVal>>40) & 0xFF);
  327. aOut[3] = (u8)((nVal>>32) & 0xFF);
  328. aOut[4] = (u8)((nVal>>24) & 0xFF);
  329. aOut[5] = (u8)((nVal>>16) & 0xFF);
  330. aOut[6] = (u8)((nVal>> 8) & 0xFF);
  331. aOut[7] = (u8)((nVal ) & 0xFF);
  332. }
  333. static int sortedBlobGrow(lsm_env *pEnv, LsmBlob *pBlob, int nData){
  334. assert( pBlob->pEnv==pEnv || (pBlob->pEnv==0 && pBlob->pData==0) );
  335. if( pBlob->nAlloc<nData ){
  336. pBlob->pData = lsmReallocOrFree(pEnv, pBlob->pData, nData);
  337. if( !pBlob->pData ) return LSM_NOMEM_BKPT;
  338. pBlob->nAlloc = nData;
  339. pBlob->pEnv = pEnv;
  340. }
  341. return LSM_OK;
  342. }
  343. static int sortedBlobSet(lsm_env *pEnv, LsmBlob *pBlob, void *pData, int nData){
  344. if( sortedBlobGrow(pEnv, pBlob, nData) ) return LSM_NOMEM;
  345. memcpy(pBlob->pData, pData, nData);
  346. pBlob->nData = nData;
  347. return LSM_OK;
  348. }
  349. #if 0
  350. static int sortedBlobCopy(LsmBlob *pDest, LsmBlob *pSrc){
  351. return sortedBlobSet(pDest, pSrc->pData, pSrc->nData);
  352. }
  353. #endif
  354. static void sortedBlobFree(LsmBlob *pBlob){
  355. assert( pBlob->pEnv || pBlob->pData==0 );
  356. if( pBlob->pData ) lsmFree(pBlob->pEnv, pBlob->pData);
  357. memset(pBlob, 0, sizeof(LsmBlob));
  358. }
  359. static int sortedReadData(
  360. Segment *pSeg,
  361. Page *pPg,
  362. int iOff,
  363. int nByte,
  364. void **ppData,
  365. LsmBlob *pBlob
  366. ){
  367. int rc = LSM_OK;
  368. int iEnd;
  369. int nData;
  370. int nCell;
  371. u8 *aData;
  372. aData = fsPageData(pPg, &nData);
  373. nCell = lsmGetU16(&aData[SEGMENT_NRECORD_OFFSET(nData)]);
  374. iEnd = SEGMENT_EOF(nData, nCell);
  375. assert( iEnd>0 && iEnd<nData );
  376. if( iOff+nByte<=iEnd ){
  377. *ppData = (void *)&aData[iOff];
  378. }else{
  379. int nRem = nByte;
  380. int i = iOff;
  381. u8 *aDest;
  382. /* Make sure the blob is big enough to store the value being loaded. */
  383. rc = sortedBlobGrow(lsmPageEnv(pPg), pBlob, nByte);
  384. if( rc!=LSM_OK ) return rc;
  385. pBlob->nData = nByte;
  386. aDest = (u8 *)pBlob->pData;
  387. *ppData = pBlob->pData;
  388. /* Increment the pointer pages ref-count. */
  389. lsmFsPageRef(pPg);
  390. while( rc==LSM_OK ){
  391. Page *pNext;
  392. int flags;
  393. /* Copy data from pPg into the output buffer. */
  394. int nCopy = LSM_MIN(nRem, iEnd-i);
  395. if( nCopy>0 ){
  396. memcpy(&aDest[nByte-nRem], &aData[i], nCopy);
  397. nRem -= nCopy;
  398. i += nCopy;
  399. assert( nRem==0 || i==iEnd );
  400. }
  401. assert( nRem>=0 );
  402. if( nRem==0 ) break;
  403. i -= iEnd;
  404. /* Grab the next page in the segment */
  405. do {
  406. rc = lsmFsDbPageNext(pSeg, pPg, 1, &pNext);
  407. if( rc==LSM_OK && pNext==0 ){
  408. rc = LSM_CORRUPT_BKPT;
  409. }
  410. if( rc ) break;
  411. lsmFsPageRelease(pPg);
  412. pPg = pNext;
  413. aData = fsPageData(pPg, &nData);
  414. flags = lsmGetU16(&aData[SEGMENT_FLAGS_OFFSET(nData)]);
  415. }while( flags&SEGMENT_BTREE_FLAG );
  416. iEnd = SEGMENT_EOF(nData, lsmGetU16(&aData[nData-2]));
  417. assert( iEnd>0 && iEnd<nData );
  418. }
  419. lsmFsPageRelease(pPg);
  420. }
  421. return rc;
  422. }
  423. static int pageGetNRec(u8 *aData, int nData){
  424. return (int)lsmGetU16(&aData[SEGMENT_NRECORD_OFFSET(nData)]);
  425. }
  426. static LsmPgno pageGetPtr(u8 *aData, int nData){
  427. return (LsmPgno)lsmGetU64(&aData[SEGMENT_POINTER_OFFSET(nData)]);
  428. }
  429. static int pageGetFlags(u8 *aData, int nData){
  430. return (int)lsmGetU16(&aData[SEGMENT_FLAGS_OFFSET(nData)]);
  431. }
  432. static u8 *pageGetCell(u8 *aData, int nData, int iCell){
  433. return &aData[lsmGetU16(&aData[SEGMENT_CELLPTR_OFFSET(nData, iCell)])];
  434. }
  435. /*
  436. ** Return the number of cells on page pPg.
  437. */
  438. static int pageObjGetNRec(Page *pPg){
  439. int nData;
  440. u8 *aData = lsmFsPageData(pPg, &nData);
  441. return pageGetNRec(aData, nData);
  442. }
  443. /*
  444. ** Return the decoded (possibly relative) pointer value stored in cell
  445. ** iCell from page aData/nData.
  446. */
  447. static LsmPgno pageGetRecordPtr(u8 *aData, int nData, int iCell){
  448. LsmPgno iRet; /* Return value */
  449. u8 *aCell; /* Pointer to cell iCell */
  450. assert( iCell<pageGetNRec(aData, nData) && iCell>=0 );
  451. aCell = pageGetCell(aData, nData, iCell);
  452. lsmVarintGet64(&aCell[1], &iRet);
  453. return iRet;
  454. }
  455. static u8 *pageGetKey(
  456. Segment *pSeg, /* Segment pPg belongs to */
  457. Page *pPg, /* Page to read from */
  458. int iCell, /* Index of cell on page to read */
  459. int *piTopic, /* OUT: Topic associated with this key */
  460. int *pnKey, /* OUT: Size of key in bytes */
  461. LsmBlob *pBlob /* If required, use this for dynamic memory */
  462. ){
  463. u8 *pKey;
  464. i64 nDummy;
  465. int eType;
  466. u8 *aData;
  467. int nData;
  468. aData = fsPageData(pPg, &nData);
  469. assert( !(pageGetFlags(aData, nData) & SEGMENT_BTREE_FLAG) );
  470. assert( iCell<pageGetNRec(aData, nData) );
  471. pKey = pageGetCell(aData, nData, iCell);
  472. eType = *pKey++;
  473. pKey += lsmVarintGet64(pKey, &nDummy);
  474. pKey += lsmVarintGet32(pKey, pnKey);
  475. if( rtIsWrite(eType) ){
  476. pKey += lsmVarintGet64(pKey, &nDummy);
  477. }
  478. *piTopic = rtTopic(eType);
  479. sortedReadData(pSeg, pPg, pKey-aData, *pnKey, (void **)&pKey, pBlob);
  480. return pKey;
  481. }
  482. static int pageGetKeyCopy(
  483. lsm_env *pEnv, /* Environment handle */
  484. Segment *pSeg, /* Segment pPg belongs to */
  485. Page *pPg, /* Page to read from */
  486. int iCell, /* Index of cell on page to read */
  487. int *piTopic, /* OUT: Topic associated with this key */
  488. LsmBlob *pBlob /* If required, use this for dynamic memory */
  489. ){
  490. int rc = LSM_OK;
  491. int nKey;
  492. u8 *aKey;
  493. aKey = pageGetKey(pSeg, pPg, iCell, piTopic, &nKey, pBlob);
  494. assert( (void *)aKey!=pBlob->pData || nKey==pBlob->nData );
  495. if( (void *)aKey!=pBlob->pData ){
  496. rc = sortedBlobSet(pEnv, pBlob, aKey, nKey);
  497. }
  498. return rc;
  499. }
  500. static LsmPgno pageGetBtreeRef(Page *pPg, int iKey){
  501. LsmPgno iRef;
  502. u8 *aData;
  503. int nData;
  504. u8 *aCell;
  505. aData = fsPageData(pPg, &nData);
  506. aCell = pageGetCell(aData, nData, iKey);
  507. assert( aCell[0]==0 );
  508. aCell++;
  509. aCell += lsmVarintGet64(aCell, &iRef);
  510. lsmVarintGet64(aCell, &iRef);
  511. assert( iRef>0 );
  512. return iRef;
  513. }
  514. #define GETVARINT64(a, i) (((i)=((u8*)(a))[0])<=240?1:lsmVarintGet64((a), &(i)))
  515. #define GETVARINT32(a, i) (((i)=((u8*)(a))[0])<=240?1:lsmVarintGet32((a), &(i)))
  516. static int pageGetBtreeKey(
  517. Segment *pSeg, /* Segment page pPg belongs to */
  518. Page *pPg,
  519. int iKey,
  520. LsmPgno *piPtr,
  521. int *piTopic,
  522. void **ppKey,
  523. int *pnKey,
  524. LsmBlob *pBlob
  525. ){
  526. u8 *aData;
  527. int nData;
  528. u8 *aCell;
  529. int eType;
  530. aData = fsPageData(pPg, &nData);
  531. assert( SEGMENT_BTREE_FLAG & pageGetFlags(aData, nData) );
  532. assert( iKey>=0 && iKey<pageGetNRec(aData, nData) );
  533. aCell = pageGetCell(aData, nData, iKey);
  534. eType = *aCell++;
  535. aCell += GETVARINT64(aCell, *piPtr);
  536. if( eType==0 ){
  537. int rc;
  538. LsmPgno iRef; /* Page number of referenced page */
  539. Page *pRef;
  540. aCell += GETVARINT64(aCell, iRef);
  541. rc = lsmFsDbPageGet(lsmPageFS(pPg), pSeg, iRef, &pRef);
  542. if( rc!=LSM_OK ) return rc;
  543. pageGetKeyCopy(lsmPageEnv(pPg), pSeg, pRef, 0, &eType, pBlob);
  544. lsmFsPageRelease(pRef);
  545. *ppKey = pBlob->pData;
  546. *pnKey = pBlob->nData;
  547. }else{
  548. aCell += GETVARINT32(aCell, *pnKey);
  549. *ppKey = aCell;
  550. }
  551. if( piTopic ) *piTopic = rtTopic(eType);
  552. return LSM_OK;
  553. }
  554. static int btreeCursorLoadKey(BtreeCursor *pCsr){
  555. int rc = LSM_OK;
  556. if( pCsr->iPg<0 ){
  557. pCsr->pKey = 0;
  558. pCsr->nKey = 0;
  559. pCsr->eType = 0;
  560. }else{
  561. LsmPgno dummy;
  562. int iPg = pCsr->iPg;
  563. int iCell = pCsr->aPg[iPg].iCell;
  564. while( iCell<0 && (--iPg)>=0 ){
  565. iCell = pCsr->aPg[iPg].iCell-1;
  566. }
  567. if( iPg<0 || iCell<0 ) return LSM_CORRUPT_BKPT;
  568. rc = pageGetBtreeKey(
  569. pCsr->pSeg,
  570. pCsr->aPg[iPg].pPage, iCell,
  571. &dummy, &pCsr->eType, &pCsr->pKey, &pCsr->nKey, &pCsr->blob
  572. );
  573. pCsr->eType |= LSM_SEPARATOR;
  574. }
  575. return rc;
  576. }
  577. static LsmPgno btreeCursorPtr(u8 *aData, int nData, int iCell){
  578. int nCell;
  579. nCell = pageGetNRec(aData, nData);
  580. if( iCell>=nCell ){
  581. return pageGetPtr(aData, nData);
  582. }
  583. return pageGetRecordPtr(aData, nData, iCell);
  584. }
  585. static int btreeCursorNext(BtreeCursor *pCsr){
  586. int rc = LSM_OK;
  587. BtreePg *pPg = &pCsr->aPg[pCsr->iPg];
  588. int nCell;
  589. u8 *aData;
  590. int nData;
  591. assert( pCsr->iPg>=0 );
  592. assert( pCsr->iPg==pCsr->nDepth-1 );
  593. aData = fsPageData(pPg->pPage, &nData);
  594. nCell = pageGetNRec(aData, nData);
  595. assert( pPg->iCell<=nCell );
  596. pPg->iCell++;
  597. if( pPg->iCell==nCell ){
  598. LsmPgno iLoad;
  599. /* Up to parent. */
  600. lsmFsPageRelease(pPg->pPage);
  601. pPg->pPage = 0;
  602. pCsr->iPg--;
  603. while( pCsr->iPg>=0 ){
  604. pPg = &pCsr->aPg[pCsr->iPg];
  605. aData = fsPageData(pPg->pPage, &nData);
  606. if( pPg->iCell<pageGetNRec(aData, nData) ) break;
  607. lsmFsPageRelease(pPg->pPage);
  608. pCsr->iPg--;
  609. }
  610. /* Read the key */
  611. rc = btreeCursorLoadKey(pCsr);
  612. /* Unless the cursor is at EOF, descend to cell -1 (yes, negative one) of
  613. ** the left-most most descendent. */
  614. if( pCsr->iPg>=0 ){
  615. pCsr->aPg[pCsr->iPg].iCell++;
  616. iLoad = btreeCursorPtr(aData, nData, pPg->iCell);
  617. do {
  618. Page *pLoad;
  619. pCsr->iPg++;
  620. rc = lsmFsDbPageGet(pCsr->pFS, pCsr->pSeg, iLoad, &pLoad);
  621. pCsr->aPg[pCsr->iPg].pPage = pLoad;
  622. pCsr->aPg[pCsr->iPg].iCell = 0;
  623. if( rc==LSM_OK ){
  624. if( pCsr->iPg==(pCsr->nDepth-1) ) break;
  625. aData = fsPageData(pLoad, &nData);
  626. iLoad = btreeCursorPtr(aData, nData, 0);
  627. }
  628. }while( rc==LSM_OK && pCsr->iPg<(pCsr->nDepth-1) );
  629. pCsr->aPg[pCsr->iPg].iCell = -1;
  630. }
  631. }else{
  632. rc = btreeCursorLoadKey(pCsr);
  633. }
  634. if( rc==LSM_OK && pCsr->iPg>=0 ){
  635. aData = fsPageData(pCsr->aPg[pCsr->iPg].pPage, &nData);
  636. pCsr->iPtr = btreeCursorPtr(aData, nData, pCsr->aPg[pCsr->iPg].iCell+1);
  637. }
  638. return rc;
  639. }
  640. static void btreeCursorFree(BtreeCursor *pCsr){
  641. if( pCsr ){
  642. int i;
  643. lsm_env *pEnv = lsmFsEnv(pCsr->pFS);
  644. for(i=0; i<=pCsr->iPg; i++){
  645. lsmFsPageRelease(pCsr->aPg[i].pPage);
  646. }
  647. sortedBlobFree(&pCsr->blob);
  648. lsmFree(pEnv, pCsr->aPg);
  649. lsmFree(pEnv, pCsr);
  650. }
  651. }
  652. static int btreeCursorFirst(BtreeCursor *pCsr){
  653. int rc;
  654. Page *pPg = 0;
  655. FileSystem *pFS = pCsr->pFS;
  656. LsmPgno iPg = pCsr->pSeg->iRoot;
  657. do {
  658. rc = lsmFsDbPageGet(pFS, pCsr->pSeg, iPg, &pPg);
  659. assert( (rc==LSM_OK)==(pPg!=0) );
  660. if( rc==LSM_OK ){
  661. u8 *aData;
  662. int nData;
  663. int flags;
  664. aData = fsPageData(pPg, &nData);
  665. flags = pageGetFlags(aData, nData);
  666. if( (flags & SEGMENT_BTREE_FLAG)==0 ) break;
  667. if( (pCsr->nDepth % 8)==0 ){
  668. int nNew = pCsr->nDepth + 8;
  669. pCsr->aPg = (BtreePg *)lsmReallocOrFreeRc(
  670. lsmFsEnv(pFS), pCsr->aPg, sizeof(BtreePg) * nNew, &rc
  671. );
  672. if( rc==LSM_OK ){
  673. memset(&pCsr->aPg[pCsr->nDepth], 0, sizeof(BtreePg) * 8);
  674. }
  675. }
  676. if( rc==LSM_OK ){
  677. assert( pCsr->aPg[pCsr->nDepth].iCell==0 );
  678. pCsr->aPg[pCsr->nDepth].pPage = pPg;
  679. pCsr->nDepth++;
  680. iPg = pageGetRecordPtr(aData, nData, 0);
  681. }
  682. }
  683. }while( rc==LSM_OK );
  684. lsmFsPageRelease(pPg);
  685. pCsr->iPg = pCsr->nDepth-1;
  686. if( rc==LSM_OK && pCsr->nDepth ){
  687. pCsr->aPg[pCsr->iPg].iCell = -1;
  688. rc = btreeCursorNext(pCsr);
  689. }
  690. return rc;
  691. }
  692. static void btreeCursorPosition(BtreeCursor *pCsr, MergeInput *p){
  693. if( pCsr->iPg>=0 ){
  694. p->iPg = lsmFsPageNumber(pCsr->aPg[pCsr->iPg].pPage);
  695. p->iCell = ((pCsr->aPg[pCsr->iPg].iCell + 1) << 8) + pCsr->nDepth;
  696. }else{
  697. p->iPg = 0;
  698. p->iCell = 0;
  699. }
  700. }
  701. static void btreeCursorSplitkey(BtreeCursor *pCsr, MergeInput *p){
  702. int iCell = pCsr->aPg[pCsr->iPg].iCell;
  703. if( iCell>=0 ){
  704. p->iCell = iCell;
  705. p->iPg = lsmFsPageNumber(pCsr->aPg[pCsr->iPg].pPage);
  706. }else{
  707. int i;
  708. for(i=pCsr->iPg-1; i>=0; i--){
  709. if( pCsr->aPg[i].iCell>0 ) break;
  710. }
  711. assert( i>=0 );
  712. p->iCell = pCsr->aPg[i].iCell-1;
  713. p->iPg = lsmFsPageNumber(pCsr->aPg[i].pPage);
  714. }
  715. }
  716. static int sortedKeyCompare(
  717. int (*xCmp)(void *, int, void *, int),
  718. int iLhsTopic, void *pLhsKey, int nLhsKey,
  719. int iRhsTopic, void *pRhsKey, int nRhsKey
  720. ){
  721. int res = iLhsTopic - iRhsTopic;
  722. if( res==0 ){
  723. res = xCmp(pLhsKey, nLhsKey, pRhsKey, nRhsKey);
  724. }
  725. return res;
  726. }
  727. static int btreeCursorRestore(
  728. BtreeCursor *pCsr,
  729. int (*xCmp)(void *, int, void *, int),
  730. MergeInput *p
  731. ){
  732. int rc = LSM_OK;
  733. if( p->iPg ){
  734. lsm_env *pEnv = lsmFsEnv(pCsr->pFS);
  735. int iCell; /* Current cell number on leaf page */
  736. LsmPgno iLeaf; /* Page number of current leaf page */
  737. int nDepth; /* Depth of b-tree structure */
  738. Segment *pSeg = pCsr->pSeg;
  739. /* Decode the MergeInput structure */
  740. iLeaf = p->iPg;
  741. nDepth = (p->iCell & 0x00FF);
  742. iCell = (p->iCell >> 8) - 1;
  743. /* Allocate the BtreeCursor.aPg[] array */
  744. assert( pCsr->aPg==0 );
  745. pCsr->aPg = (BtreePg *)lsmMallocZeroRc(pEnv, sizeof(BtreePg) * nDepth, &rc);
  746. /* Populate the last entry of the aPg[] array */
  747. if( rc==LSM_OK ){
  748. Page **pp = &pCsr->aPg[nDepth-1].pPage;
  749. pCsr->iPg = nDepth-1;
  750. pCsr->nDepth = nDepth;
  751. pCsr->aPg[pCsr->iPg].iCell = iCell;
  752. rc = lsmFsDbPageGet(pCsr->pFS, pSeg, iLeaf, pp);
  753. }
  754. /* Populate any other aPg[] array entries */
  755. if( rc==LSM_OK && nDepth>1 ){
  756. LsmBlob blob = {0,0,0};
  757. void *pSeek;
  758. int nSeek;
  759. int iTopicSeek;
  760. int iPg = 0;
  761. LsmPgno iLoad = pSeg->iRoot;
  762. Page *pPg = pCsr->aPg[nDepth-1].pPage;
  763. if( pageObjGetNRec(pPg)==0 ){
  764. /* This can happen when pPg is the right-most leaf in the b-tree.
  765. ** In this case, set the iTopicSeek/pSeek/nSeek key to a value
  766. ** greater than any real key. */
  767. assert( iCell==-1 );
  768. iTopicSeek = 1000;
  769. pSeek = 0;
  770. nSeek = 0;
  771. }else{
  772. LsmPgno dummy;
  773. rc = pageGetBtreeKey(pSeg, pPg,
  774. 0, &dummy, &iTopicSeek, &pSeek, &nSeek, &pCsr->blob
  775. );
  776. }
  777. do {
  778. Page *pPg2;
  779. rc = lsmFsDbPageGet(pCsr->pFS, pSeg, iLoad, &pPg2);
  780. assert( rc==LSM_OK || pPg2==0 );
  781. if( rc==LSM_OK ){
  782. u8 *aData; /* Buffer containing page data */
  783. int nData; /* Size of aData[] in bytes */
  784. int iMin;
  785. int iMax;
  786. int iCell2;
  787. aData = fsPageData(pPg2, &nData);
  788. assert( (pageGetFlags(aData, nData) & SEGMENT_BTREE_FLAG) );
  789. iLoad = pageGetPtr(aData, nData);
  790. iCell2 = pageGetNRec(aData, nData);
  791. iMax = iCell2-1;
  792. iMin = 0;
  793. while( iMax>=iMin ){
  794. int iTry = (iMin+iMax)/2;
  795. void *pKey; int nKey; /* Key for cell iTry */
  796. int iTopic; /* Topic for key pKeyT/nKeyT */
  797. LsmPgno iPtr; /* Pointer for cell iTry */
  798. int res; /* (pSeek - pKeyT) */
  799. rc = pageGetBtreeKey(
  800. pSeg, pPg2, iTry, &iPtr, &iTopic, &pKey, &nKey, &blob
  801. );
  802. if( rc!=LSM_OK ) break;
  803. res = sortedKeyCompare(
  804. xCmp, iTopicSeek, pSeek, nSeek, iTopic, pKey, nKey
  805. );
  806. assert( res!=0 );
  807. if( res<0 ){
  808. iLoad = iPtr;
  809. iCell2 = iTry;
  810. iMax = iTry-1;
  811. }else{
  812. iMin = iTry+1;
  813. }
  814. }
  815. pCsr->aPg[iPg].pPage = pPg2;
  816. pCsr->aPg[iPg].iCell = iCell2;
  817. iPg++;
  818. assert( iPg!=nDepth-1
  819. || lsmFsRedirectPage(pCsr->pFS, pSeg->pRedirect, iLoad)==iLeaf
  820. );
  821. }
  822. }while( rc==LSM_OK && iPg<(nDepth-1) );
  823. sortedBlobFree(&blob);
  824. }
  825. /* Load the current key and pointer */
  826. if( rc==LSM_OK ){
  827. BtreePg *pBtreePg;
  828. u8 *aData;
  829. int nData;
  830. pBtreePg = &pCsr->aPg[pCsr->iPg];
  831. aData = fsPageData(pBtreePg->pPage, &nData);
  832. pCsr->iPtr = btreeCursorPtr(aData, nData, pBtreePg->iCell+1);
  833. if( pBtreePg->iCell<0 ){
  834. LsmPgno dummy;
  835. int i;
  836. for(i=pCsr->iPg-1; i>=0; i--){
  837. if( pCsr->aPg[i].iCell>0 ) break;
  838. }
  839. assert( i>=0 );
  840. rc = pageGetBtreeKey(pSeg,
  841. pCsr->aPg[i].pPage, pCsr->aPg[i].iCell-1,
  842. &dummy, &pCsr->eType, &pCsr->pKey, &pCsr->nKey, &pCsr->blob
  843. );
  844. pCsr->eType |= LSM_SEPARATOR;
  845. }else{
  846. rc = btreeCursorLoadKey(pCsr);
  847. }
  848. }
  849. }
  850. return rc;
  851. }
  852. static int btreeCursorNew(
  853. lsm_db *pDb,
  854. Segment *pSeg,
  855. BtreeCursor **ppCsr
  856. ){
  857. int rc = LSM_OK;
  858. BtreeCursor *pCsr;
  859. assert( pSeg->iRoot );
  860. pCsr = lsmMallocZeroRc(pDb->pEnv, sizeof(BtreeCursor), &rc);
  861. if( pCsr ){
  862. pCsr->pFS = pDb->pFS;
  863. pCsr->pSeg = pSeg;
  864. pCsr->iPg = -1;
  865. }
  866. *ppCsr = pCsr;
  867. return rc;
  868. }
  869. static void segmentPtrSetPage(SegmentPtr *pPtr, Page *pNext){
  870. lsmFsPageRelease(pPtr->pPg);
  871. if( pNext ){
  872. int nData;
  873. u8 *aData = fsPageData(pNext, &nData);
  874. pPtr->nCell = pageGetNRec(aData, nData);
  875. pPtr->flags = (u16)pageGetFlags(aData, nData);
  876. pPtr->iPtr = pageGetPtr(aData, nData);
  877. }
  878. pPtr->pPg = pNext;
  879. }
  880. /*
  881. ** Load a new page into the SegmentPtr object pPtr.
  882. */
  883. static int segmentPtrLoadPage(
  884. FileSystem *pFS,
  885. SegmentPtr *pPtr, /* Load page into this SegmentPtr object */
  886. LsmPgno iNew /* Page number of new page */
  887. ){
  888. Page *pPg = 0; /* The new page */
  889. int rc; /* Return Code */
  890. rc = lsmFsDbPageGet(pFS, pPtr->pSeg, iNew, &pPg);
  891. assert( rc==LSM_OK || pPg==0 );
  892. segmentPtrSetPage(pPtr, pPg);
  893. return rc;
  894. }
  895. static int segmentPtrReadData(
  896. SegmentPtr *pPtr,
  897. int iOff,
  898. int nByte,
  899. void **ppData,
  900. LsmBlob *pBlob
  901. ){
  902. return sortedReadData(pPtr->pSeg, pPtr->pPg, iOff, nByte, ppData, pBlob);
  903. }
  904. static int segmentPtrNextPage(
  905. SegmentPtr *pPtr, /* Load page into this SegmentPtr object */
  906. int eDir /* +1 for next(), -1 for prev() */
  907. ){
  908. Page *pNext; /* New page to load */
  909. int rc; /* Return code */
  910. assert( eDir==1 || eDir==-1 );
  911. assert( pPtr->pPg );
  912. assert( pPtr->pSeg || eDir>0 );
  913. rc = lsmFsDbPageNext(pPtr->pSeg, pPtr->pPg, eDir, &pNext);
  914. assert( rc==LSM_OK || pNext==0 );
  915. segmentPtrSetPage(pPtr, pNext);
  916. return rc;
  917. }
  918. static int segmentPtrLoadCell(
  919. SegmentPtr *pPtr, /* Load page into this SegmentPtr object */
  920. int iNew /* Cell number of new cell */
  921. ){
  922. int rc = LSM_OK;
  923. if( pPtr->pPg ){
  924. u8 *aData; /* Pointer to page data buffer */
  925. int iOff; /* Offset in aData[] to read from */
  926. int nPgsz; /* Size of page (aData[]) in bytes */
  927. assert( iNew<pPtr->nCell );
  928. pPtr->iCell = iNew;
  929. aData = fsPageData(pPtr->pPg, &nPgsz);
  930. iOff = lsmGetU16(&aData[SEGMENT_CELLPTR_OFFSET(nPgsz, pPtr->iCell)]);
  931. pPtr->eType = aData[iOff];
  932. iOff++;
  933. iOff += GETVARINT64(&aData[iOff], pPtr->iPgPtr);
  934. iOff += GETVARINT32(&aData[iOff], pPtr->nKey);
  935. if( rtIsWrite(pPtr->eType) ){
  936. iOff += GETVARINT32(&aData[iOff], pPtr->nVal);
  937. }
  938. assert( pPtr->nKey>=0 );
  939. rc = segmentPtrReadData(
  940. pPtr, iOff, pPtr->nKey, &pPtr->pKey, &pPtr->blob1
  941. );
  942. if( rc==LSM_OK && rtIsWrite(pPtr->eType) ){
  943. rc = segmentPtrReadData(
  944. pPtr, iOff+pPtr->nKey, pPtr->nVal, &pPtr->pVal, &pPtr->blob2
  945. );
  946. }else{
  947. pPtr->nVal = 0;
  948. pPtr->pVal = 0;
  949. }
  950. }
  951. return rc;
  952. }
  953. static Segment *sortedSplitkeySegment(Level *pLevel){
  954. Merge *pMerge = pLevel->pMerge;
  955. MergeInput *p = &pMerge->splitkey;
  956. Segment *pSeg;
  957. int i;
  958. for(i=0; i<pMerge->nInput; i++){
  959. if( p->iPg==pMerge->aInput[i].iPg ) break;
  960. }
  961. if( pMerge->nInput==(pLevel->nRight+1) && i>=(pMerge->nInput-1) ){
  962. pSeg = &pLevel->pNext->lhs;
  963. }else{
  964. pSeg = &pLevel->aRhs[i];
  965. }
  966. return pSeg;
  967. }
  968. static void sortedSplitkey(lsm_db *pDb, Level *pLevel, int *pRc){
  969. Segment *pSeg;
  970. Page *pPg = 0;
  971. lsm_env *pEnv = pDb->pEnv; /* Environment handle */
  972. int rc = *pRc;
  973. Merge *pMerge = pLevel->pMerge;
  974. pSeg = sortedSplitkeySegment(pLevel);
  975. if( rc==LSM_OK ){
  976. rc = lsmFsDbPageGet(pDb->pFS, pSeg, pMerge->splitkey.iPg, &pPg);
  977. }
  978. if( rc==LSM_OK ){
  979. int iTopic;
  980. LsmBlob blob = {0, 0, 0, 0};
  981. u8 *aData;
  982. int nData;
  983. aData = lsmFsPageData(pPg, &nData);
  984. if( pageGetFlags(aData, nData) & SEGMENT_BTREE_FLAG ){
  985. void *pKey;
  986. int nKey;
  987. LsmPgno dummy;
  988. rc = pageGetBtreeKey(pSeg,
  989. pPg, pMerge->splitkey.iCell, &dummy, &iTopic, &pKey, &nKey, &blob
  990. );
  991. if( rc==LSM_OK && blob.pData!=pKey ){
  992. rc = sortedBlobSet(pEnv, &blob, pKey, nKey);
  993. }
  994. }else{
  995. rc = pageGetKeyCopy(
  996. pEnv, pSeg, pPg, pMerge->splitkey.iCell, &iTopic, &blob
  997. );
  998. }
  999. pLevel->iSplitTopic = iTopic;
  1000. pLevel->pSplitKey = blob.pData;
  1001. pLevel->nSplitKey = blob.nData;
  1002. lsmFsPageRelease(pPg);
  1003. }
  1004. *pRc = rc;
  1005. }
  1006. /*
  1007. ** Reset a segment cursor. Also free its buffers if they are nThreshold
  1008. ** bytes or larger in size.
  1009. */
  1010. static void segmentPtrReset(SegmentPtr *pPtr, int nThreshold){
  1011. lsmFsPageRelease(pPtr->pPg);
  1012. pPtr->pPg = 0;
  1013. pPtr->nCell = 0;
  1014. pPtr->pKey = 0;
  1015. pPtr->nKey = 0;
  1016. pPtr->pVal = 0;
  1017. pPtr->nVal = 0;
  1018. pPtr->eType = 0;
  1019. pPtr->iCell = 0;
  1020. if( pPtr->blob1.nAlloc>=nThreshold ) sortedBlobFree(&pPtr->blob1);
  1021. if( pPtr->blob2.nAlloc>=nThreshold ) sortedBlobFree(&pPtr->blob2);
  1022. }
  1023. static int segmentPtrIgnoreSeparators(MultiCursor *pCsr, SegmentPtr *pPtr){
  1024. return (pCsr->flags & CURSOR_READ_SEPARATORS)==0
  1025. || (pPtr!=&pCsr->aPtr[pCsr->nPtr-1]);
  1026. }
  1027. static int segmentPtrAdvance(
  1028. MultiCursor *pCsr,
  1029. SegmentPtr *pPtr,
  1030. int bReverse
  1031. ){
  1032. int eDir = (bReverse ? -1 : 1);
  1033. Level *pLvl = pPtr->pLevel;
  1034. do {
  1035. int rc;
  1036. int iCell; /* Number of new cell in page */
  1037. int svFlags = 0; /* SegmentPtr.eType before advance */
  1038. iCell = pPtr->iCell + eDir;
  1039. assert( pPtr->pPg );
  1040. assert( iCell<=pPtr->nCell && iCell>=-1 );
  1041. if( bReverse && pPtr->pSeg!=&pPtr->pLevel->lhs ){
  1042. svFlags = pPtr->eType;
  1043. assert( svFlags );
  1044. }
  1045. if( iCell>=pPtr->nCell || iCell<0 ){
  1046. do {
  1047. rc = segmentPtrNextPage(pPtr, eDir);
  1048. }while( rc==LSM_OK
  1049. && pPtr->pPg
  1050. && (pPtr->nCell==0 || (pPtr->flags & SEGMENT_BTREE_FLAG) )
  1051. );
  1052. if( rc!=LSM_OK ) return rc;
  1053. iCell = bReverse ? (pPtr->nCell-1) : 0;
  1054. }
  1055. rc = segmentPtrLoadCell(pPtr, iCell);
  1056. if( rc!=LSM_OK ) return rc;
  1057. if( svFlags && pPtr->pPg ){
  1058. int res = sortedKeyCompare(pCsr->pDb->xCmp,
  1059. rtTopic(pPtr->eType), pPtr->pKey, pPtr->nKey,
  1060. pLvl->iSplitTopic, pLvl->pSplitKey, pLvl->nSplitKey
  1061. );
  1062. if( res<0 ) segmentPtrReset(pPtr, LSM_SEGMENTPTR_FREE_THRESHOLD);
  1063. }
  1064. if( pPtr->pPg==0 && (svFlags & LSM_END_DELETE) ){
  1065. Segment *pSeg = pPtr->pSeg;
  1066. rc = lsmFsDbPageGet(pCsr->pDb->pFS, pSeg, pSeg->iFirst, &pPtr->pPg);
  1067. if( rc!=LSM_OK ) return rc;
  1068. pPtr->eType = LSM_START_DELETE | LSM_POINT_DELETE;
  1069. pPtr->eType |= (pLvl->iSplitTopic ? LSM_SYSTEMKEY : 0);
  1070. pPtr->pKey = pLvl->pSplitKey;
  1071. pPtr->nKey = pLvl->nSplitKey;
  1072. }
  1073. }while( pCsr
  1074. && pPtr->pPg
  1075. && segmentPtrIgnoreSeparators(pCsr, pPtr)
  1076. && rtIsSeparator(pPtr->eType)
  1077. );
  1078. return LSM_OK;
  1079. }
  1080. static void segmentPtrEndPage(
  1081. FileSystem *pFS,
  1082. SegmentPtr *pPtr,
  1083. int bLast,
  1084. int *pRc
  1085. ){
  1086. if( *pRc==LSM_OK ){
  1087. Segment *pSeg = pPtr->pSeg;
  1088. Page *pNew = 0;
  1089. if( bLast ){
  1090. *pRc = lsmFsDbPageLast(pFS, pSeg, &pNew);
  1091. }else{
  1092. *pRc = lsmFsDbPageGet(pFS, pSeg, pSeg->iFirst, &pNew);
  1093. }
  1094. segmentPtrSetPage(pPtr, pNew);
  1095. }
  1096. }
  1097. /*
  1098. ** Try to move the segment pointer passed as the second argument so that it
  1099. ** points at either the first (bLast==0) or last (bLast==1) cell in the valid
  1100. ** region of the segment defined by pPtr->iFirst and pPtr->iLast.
  1101. **
  1102. ** Return LSM_OK if successful or an lsm error code if something goes
  1103. ** wrong (IO error, OOM etc.).
  1104. */
  1105. static int segmentPtrEnd(MultiCursor *pCsr, SegmentPtr *pPtr, int bLast){
  1106. Level *pLvl = pPtr->pLevel;
  1107. int rc = LSM_OK;
  1108. FileSystem *pFS = pCsr->pDb->pFS;
  1109. int bIgnore;
  1110. segmentPtrEndPage(pFS, pPtr, bLast, &rc);
  1111. while( rc==LSM_OK && pPtr->pPg
  1112. && (pPtr->nCell==0 || (pPtr->flags & SEGMENT_BTREE_FLAG))
  1113. ){
  1114. rc = segmentPtrNextPage(pPtr, (bLast ? -1 : 1));
  1115. }
  1116. if( rc==LSM_OK && pPtr->pPg ){
  1117. rc = segmentPtrLoadCell(pPtr, bLast ? (pPtr->nCell-1) : 0);
  1118. if( rc==LSM_OK && bLast && pPtr->pSeg!=&pLvl->lhs ){
  1119. int res = sortedKeyCompare(pCsr->pDb->xCmp,
  1120. rtTopic(pPtr->eType), pPtr->pKey, pPtr->nKey,
  1121. pLvl->iSplitTopic, pLvl->pSplitKey, pLvl->nSplitKey
  1122. );
  1123. if( res<0 ) segmentPtrReset(pPtr, LSM_SEGMENTPTR_FREE_THRESHOLD);
  1124. }
  1125. }
  1126. bIgnore = segmentPtrIgnoreSeparators(pCsr, pPtr);
  1127. if( rc==LSM_OK && pPtr->pPg && bIgnore && rtIsSeparator(pPtr->eType) ){
  1128. rc = segmentPtrAdvance(pCsr, pPtr, bLast);
  1129. }
  1130. #if 0
  1131. if( bLast && rc==LSM_OK && pPtr->pPg
  1132. && pPtr->pSeg==&pLvl->lhs
  1133. && pLvl->nRight && (pPtr->eType & LSM_START_DELETE)
  1134. ){
  1135. pPtr->iCell++;
  1136. pPtr->eType = LSM_END_DELETE | (pLvl->iSplitTopic);
  1137. pPtr->pKey = pLvl->pSplitKey;
  1138. pPtr->nKey = pLvl->nSplitKey;
  1139. pPtr->pVal = 0;
  1140. pPtr->nVal = 0;
  1141. }
  1142. #endif
  1143. return rc;
  1144. }
  1145. static void segmentPtrKey(SegmentPtr *pPtr, void **ppKey, int *pnKey){
  1146. assert( pPtr->pPg );
  1147. *ppKey = pPtr->pKey;
  1148. *pnKey = pPtr->nKey;
  1149. }
  1150. #if 0 /* NOT USED */
  1151. static char *keyToString(lsm_env *pEnv, void *pKey, int nKey){
  1152. int i;
  1153. u8 *aKey = (u8 *)pKey;
  1154. char *zRet = (char *)lsmMalloc(pEnv, nKey+1);
  1155. for(i=0; i<nKey; i++){
  1156. zRet[i] = (char)(isalnum(aKey[i]) ? aKey[i] : '.');
  1157. }
  1158. zRet[nKey] = '\0';
  1159. return zRet;
  1160. }
  1161. #endif
  1162. #if 0 /* NOT USED */
  1163. /*
  1164. ** Check that the page that pPtr currently has loaded is the correct page
  1165. ** to search for key (pKey/nKey). If it is, return 1. Otherwise, an assert
  1166. ** fails and this function does not return.
  1167. */
  1168. static int assertKeyLocation(
  1169. MultiCursor *pCsr,
  1170. SegmentPtr *pPtr,
  1171. void *pKey, int nKey
  1172. ){
  1173. lsm_env *pEnv = lsmFsEnv(pCsr->pDb->pFS);
  1174. LsmBlob blob = {0, 0, 0};
  1175. int eDir;
  1176. int iTopic = 0; /* TODO: Fix me */
  1177. for(eDir=-1; eDir<=1; eDir+=2){
  1178. Page *pTest = pPtr->pPg;
  1179. lsmFsPageRef(pTest);
  1180. while( pTest ){
  1181. Segment *pSeg = pPtr->pSeg;
  1182. Page *pNext;
  1183. int rc = lsmFsDbPageNext(pSeg, pTest, eDir, &pNext);
  1184. lsmFsPageRelease(pTest);
  1185. if( rc ) return 1;
  1186. pTest = pNext;
  1187. if( pTest ){
  1188. int nData;
  1189. u8 *aData = fsPageData(pTest, &nData);
  1190. int nCell = pageGetNRec(aData, nData);
  1191. int flags = pageGetFlags(aData, nData);
  1192. if( nCell && 0==(flags&SEGMENT_BTREE_FLAG) ){
  1193. int nPgKey;
  1194. int iPgTopic;
  1195. u8 *pPgKey;
  1196. int res;
  1197. int iCell;
  1198. iCell = ((eDir < 0) ? (nCell-1) : 0);
  1199. pPgKey = pageGetKey(pSeg, pTest, iCell, &iPgTopic, &nPgKey, &blob);
  1200. res = iTopic - iPgTopic;
  1201. if( res==0 ) res = pCsr->pDb->xCmp(pKey, nKey, pPgKey, nPgKey);
  1202. if( (eDir==1 && res>0) || (eDir==-1 && res<0) ){
  1203. /* Taking this branch means something has gone wrong. */
  1204. char *zMsg = lsmMallocPrintf(pEnv, "Key \"%s\" is not on page %d",
  1205. keyToString(pEnv, pKey, nKey), lsmFsPageNumber(pPtr->pPg)
  1206. );
  1207. fprintf(stderr, "%s\n", zMsg);
  1208. assert( !"assertKeyLocation() failed" );
  1209. }
  1210. lsmFsPageRelease(pTest);
  1211. pTest = 0;
  1212. }
  1213. }
  1214. }
  1215. }
  1216. sortedBlobFree(&blob);
  1217. return 1;
  1218. }
  1219. #endif
  1220. #ifndef NDEBUG
  1221. static int assertSeekResult(
  1222. MultiCursor *pCsr,
  1223. SegmentPtr *pPtr,
  1224. int iTopic,
  1225. void *pKey,
  1226. int nKey,
  1227. int eSeek
  1228. ){
  1229. if( pPtr->pPg ){
  1230. int res;
  1231. res = sortedKeyCompare(pCsr->pDb->xCmp, iTopic, pKey, nKey,
  1232. rtTopic(pPtr->eType), pPtr->pKey, pPtr->nKey
  1233. );
  1234. if( eSeek==LSM_SEEK_EQ ) return (res==0);
  1235. if( eSeek==LSM_SEEK_LE ) return (res>=0);
  1236. if( eSeek==LSM_SEEK_GE ) return (res<=0);
  1237. }
  1238. return 1;
  1239. }
  1240. #endif
  1241. static int segmentPtrSearchOversized(
  1242. MultiCursor *pCsr, /* Cursor context */
  1243. SegmentPtr *pPtr, /* Pointer to seek */
  1244. int iTopic, /* Topic of key to search for */
  1245. void *pKey, int nKey /* Key to seek to */
  1246. ){
  1247. int (*xCmp)(void *, int, void *, int) = pCsr->pDb->xCmp;
  1248. int rc = LSM_OK;
  1249. /* If the OVERSIZED flag is set, then there is no pointer in the
  1250. ** upper level to the next page in the segment that contains at least
  1251. ** one key. So compare the largest key on the current page with the
  1252. ** key being sought (pKey/nKey). If (pKey/nKey) is larger, advance
  1253. ** to the next page in the segment that contains at least one key.
  1254. */
  1255. while( rc==LSM_OK && (pPtr->flags & PGFTR_SKIP_NEXT_FLAG) ){
  1256. u8 *pLastKey;
  1257. int nLastKey;
  1258. int iLastTopic;
  1259. int res; /* Result of comparison */
  1260. Page *pNext;
  1261. /* Load the last key on the current page. */
  1262. pLastKey = pageGetKey(pPtr->pSeg,
  1263. pPtr->pPg, pPtr->nCell-1, &iLastTopic, &nLastKey, &pPtr->blob1
  1264. );
  1265. /* If the loaded key is >= than (pKey/nKey), break out of the loop.
  1266. ** If (pKey/nKey) is present in this array, it must be on the current
  1267. ** page. */
  1268. res = sortedKeyCompare(
  1269. xCmp, iLastTopic, pLastKey, nLastKey, iTopic, pKey, nKey
  1270. );
  1271. if( res>=0 ) break;
  1272. /* Advance to the next page that contains at least one key. */
  1273. pNext = pPtr->pPg;
  1274. lsmFsPageRef(pNext);
  1275. while( 1 ){
  1276. Page *pLoad;
  1277. u8 *aData; int nData;
  1278. rc = lsmFsDbPageNext(pPtr->pSeg, pNext, 1, &pLoad);
  1279. lsmFsPageRelease(pNext);
  1280. pNext = pLoad;
  1281. if( pNext==0 ) break;
  1282. assert( rc==LSM_OK );
  1283. aData = lsmFsPageData(pNext, &nData);
  1284. if( (pageGetFlags(aData, nData) & SEGMENT_BTREE_FLAG)==0
  1285. && pageGetNRec(aData, nData)>0
  1286. ){
  1287. break;
  1288. }
  1289. }
  1290. if( pNext==0 ) break;
  1291. segmentPtrSetPage(pPtr, pNext);
  1292. /* This should probably be an LSM_CORRUPT error. */
  1293. assert( rc!=LSM_OK || (pPtr->flags & PGFTR_SKIP_THIS_FLAG) );
  1294. }
  1295. return rc;
  1296. }
  1297. static int ptrFwdPointer(
  1298. Page *pPage,
  1299. int iCell,
  1300. Segment *pSeg,
  1301. LsmPgno *piPtr,
  1302. int *pbFound
  1303. ){
  1304. Page *pPg = pPage;
  1305. int iFirst = iCell;
  1306. int rc = LSM_OK;
  1307. do {
  1308. Page *pNext = 0;
  1309. u8 *aData;
  1310. int nData;
  1311. aData = lsmFsPageData(pPg, &nData);
  1312. if( (pageGetFlags(aData, nData) & SEGMENT_BTREE_FLAG)==0 ){
  1313. int i;
  1314. int nCell = pageGetNRec(aData, nData);
  1315. for(i=iFirst; i<nCell; i++){
  1316. u8 eType = *pageGetCell(aData, nData, i);
  1317. if( (eType & LSM_START_DELETE)==0 ){
  1318. *pbFound = 1;
  1319. *piPtr = pageGetRecordPtr(aData, nData, i) + pageGetPtr(aData, nData);
  1320. lsmFsPageRelease(pPg);
  1321. return LSM_OK;
  1322. }
  1323. }
  1324. }
  1325. rc = lsmFsDbPageNext(pSeg, pPg, 1, &pNext);
  1326. lsmFsPageRelease(pPg);
  1327. pPg = pNext;
  1328. iFirst = 0;
  1329. }while( pPg && rc==LSM_OK );
  1330. lsmFsPageRelease(pPg);
  1331. *pbFound = 0;
  1332. return rc;
  1333. }
  1334. static int sortedRhsFirst(MultiCursor *pCsr, Level *pLvl, SegmentPtr *pPtr){
  1335. int rc;
  1336. rc = segmentPtrEnd(pCsr, pPtr, 0);
  1337. while( pPtr->pPg && rc==LSM_OK ){
  1338. int res = sortedKeyCompare(pCsr->pDb->xCmp,
  1339. pLvl->iSplitTopic, pLvl->pSplitKey, pLvl->nSplitKey,
  1340. rtTopic(pPtr->eType), pPtr->pKey, pPtr->nKey
  1341. );
  1342. if( res<=0 ) break;
  1343. rc = segmentPtrAdvance(pCsr, pPtr, 0);
  1344. }
  1345. return rc;
  1346. }
  1347. /*
  1348. ** This function is called as part of a SEEK_GE op on a multi-cursor if the
  1349. ** FC pointer read from segment *pPtr comes from an entry with the
  1350. ** LSM_START_DELETE flag set. In this case the pointer value cannot be
  1351. ** trusted. Instead, the pointer that should be followed is that associated
  1352. ** with the next entry in *pPtr that does not have LSM_START_DELETE set.
  1353. **
  1354. ** Why the pointers can't be trusted:
  1355. **
  1356. **
  1357. **
  1358. ** TODO: This is a stop-gap solution:
  1359. **
  1360. ** At the moment, this function is called from within segmentPtrSeek(),
  1361. ** as part of the initial lsmMCursorSeek() call. However, consider a
  1362. ** database where the following has occurred:
  1363. **
  1364. ** 1. A range delete removes keys 1..9999 using a range delete.
  1365. ** 2. Keys 1 through 9999 are reinserted.
  1366. ** 3. The levels containing the ops in 1. and 2. above are merged. Call
  1367. ** this level N. Level N contains FC pointers to level N+1.
  1368. **
  1369. ** Then, if the user attempts to query for (key>=2 LIMIT 10), the
  1370. ** lsmMCursorSeek() call will iterate through 9998 entries searching for a
  1371. ** pointer down to the level N+1 that is never actually used. It would be
  1372. ** much better if the multi-cursor could do this lazily - only seek to the
  1373. ** level (N+1) page after the user has moved the cursor on level N passed
  1374. ** the big range-delete.
  1375. */
  1376. static int segmentPtrFwdPointer(
  1377. MultiCursor *pCsr, /* Multi-cursor pPtr belongs to */
  1378. SegmentPtr *pPtr, /* Segment-pointer to extract FC ptr from */
  1379. LsmPgno *piPtr /* OUT: FC pointer value */
  1380. ){
  1381. Level *pLvl = pPtr->pLevel;
  1382. Level *pNext = pLvl->pNext;
  1383. Page *pPg = pPtr->pPg;
  1384. int rc;
  1385. int bFound;
  1386. LsmPgno iOut = 0;
  1387. if( pPtr->pSeg==&pLvl->lhs || pPtr->pSeg==&pLvl->aRhs[pLvl->nRight-1] ){
  1388. if( pNext==0
  1389. || (pNext->nRight==0 && pNext->lhs.iRoot)
  1390. || (pNext->nRight!=0 && pNext->aRhs[0].iRoot)
  1391. ){
  1392. /* Do nothing. The pointer will not be used anyway. */
  1393. return LSM_OK;
  1394. }
  1395. }else{
  1396. if( pPtr[1].pSeg->iRoot ){
  1397. return LSM_OK;
  1398. }
  1399. }
  1400. /* Search for a pointer within the current segment. */
  1401. lsmFsPageRef(pPg);
  1402. rc = ptrFwdPointer(pPg, pPtr->iCell, pPtr->pSeg, &iOut, &bFound);
  1403. if( rc==LSM_OK && bFound==0 ){
  1404. /* This case happens when pPtr points to the left-hand-side of a segment
  1405. ** currently undergoing an incremental merge. In this case, jump to the
  1406. ** oldest segment in the right-hand-side of the same level and continue
  1407. ** searching. But - do not consider any keys smaller than the levels
  1408. ** split-key. */
  1409. SegmentPtr ptr;
  1410. if( pPtr->pLevel->nRight==0 || pPtr->pSeg!=&pPtr->pLevel->lhs ){
  1411. return LSM_CORRUPT_BKPT;
  1412. }
  1413. memset(&ptr, 0, sizeof(SegmentPtr));
  1414. ptr.pLevel = pPtr->pLevel;
  1415. ptr.pSeg = &ptr.pLevel->aRhs[ptr.pLevel->nRight-1];
  1416. rc = sortedRhsFirst(pCsr, ptr.pLevel, &ptr);
  1417. if( rc==LSM_OK ){
  1418. rc = ptrFwdPointer(ptr.pPg, ptr.iCell, ptr.pSeg, &iOut, &bFound);
  1419. ptr.pPg = 0;
  1420. }
  1421. segmentPtrReset(&ptr, 0);
  1422. }
  1423. *piPtr = iOut;
  1424. return rc;
  1425. }
  1426. static int segmentPtrSeek(
  1427. MultiCursor *pCsr, /* Cursor context */
  1428. SegmentPtr *pPtr, /* Pointer to seek */
  1429. int iTopic, /* Key topic to seek to */
  1430. void *pKey, int nKey, /* Key to seek to */
  1431. int eSeek, /* Search bias - see above */
  1432. LsmPgno *piPtr, /* OUT: FC pointer */
  1433. int *pbStop
  1434. ){
  1435. int (*xCmp)(void *, int, void *, int) = pCsr->pDb->xCmp;
  1436. int res = 0; /* Result of comparison operation */
  1437. int rc = LSM_OK;
  1438. int iMin;
  1439. int iMax;
  1440. LsmPgno iPtrOut = 0;
  1441. /* If the current page contains an oversized entry, then there are no
  1442. ** pointers to one or more of the subsequent pages in the sorted run.
  1443. ** The following call ensures that the segment-ptr points to the correct
  1444. ** page in this case. */
  1445. rc = segmentPtrSearchOversized(pCsr, pPtr, iTopic, pKey, nKey);
  1446. iPtrOut = pPtr->iPtr;
  1447. /* Assert that this page is the right page of this segment for the key
  1448. ** that we are searching for. Do this by loading page (iPg-1) and testing
  1449. ** that pKey/nKey is greater than all keys on that page, and then by
  1450. ** loading (iPg+1) and testing that pKey/nKey is smaller than all
  1451. ** the keys it houses.
  1452. **
  1453. ** TODO: With range-deletes in the tree, the test described above may fail.
  1454. */
  1455. #if 0
  1456. assert( assertKeyLocation(pCsr, pPtr, pKey, nKey) );
  1457. #endif
  1458. assert( pPtr->nCell>0
  1459. || pPtr->pSeg->nSize==1
  1460. || lsmFsDbPageIsLast(pPtr->pSeg, pPtr->pPg)
  1461. );
  1462. if( pPtr->nCell==0 ){
  1463. segmentPtrReset(pPtr, LSM_SEGMENTPTR_FREE_THRESHOLD);
  1464. }else{
  1465. iMin = 0;
  1466. iMax = pPtr->nCell-1;
  1467. while( 1 ){
  1468. int iTry = (iMin+iMax)/2;
  1469. void *pKeyT; int nKeyT; /* Key for cell iTry */
  1470. int iTopicT;
  1471. assert( iTry<iMax || iMin==iMax );
  1472. rc = segmentPtrLoadCell(pPtr, iTry);
  1473. if( rc!=LSM_OK ) break;
  1474. segmentPtrKey(pPtr, &pKeyT, &nKeyT);
  1475. iTopicT = rtTopic(pPtr->eType);
  1476. res = sortedKeyCompare(xCmp, iTopicT, pKeyT, nKeyT, iTopic, pKey, nKey);
  1477. if( res<=0 ){
  1478. iPtrOut = pPtr->iPtr + pPtr->iPgPtr;
  1479. }
  1480. if( res==0 || iMin==iMax ){
  1481. break;
  1482. }else if( res>0 ){
  1483. iMax = LSM_MAX(iTry-1, iMin);
  1484. }else{
  1485. iMin = iTry+1;
  1486. }
  1487. }
  1488. if( rc==LSM_OK ){
  1489. assert( res==0 || (iMin==iMax && iMin>=0 && iMin<pPtr->nCell) );
  1490. if( res ){
  1491. rc = segmentPtrLoadCell(pPtr, iMin);
  1492. }
  1493. assert( rc!=LSM_OK || res>0 || iPtrOut==(pPtr->iPtr + pPtr->iPgPtr) );
  1494. if( rc==LSM_OK ){
  1495. switch( eSeek ){
  1496. case LSM_SEEK_EQ: {
  1497. int eType = pPtr->eType;
  1498. if( (res<0 && (eType & LSM_START_DELETE))
  1499. || (res>0 && (eType & LSM_END_DELETE))
  1500. || (res==0 && (eType & LSM_POINT_DELETE))
  1501. ){
  1502. *pbStop = 1;
  1503. }else if( res==0 && (eType & LSM_INSERT) ){
  1504. lsm_env *pEnv = pCsr->pDb->pEnv;
  1505. *pbStop = 1;
  1506. pCsr->eType = pPtr->eType;
  1507. rc = sortedBlobSet(pEnv, &pCsr->key, pPtr->pKey, pPtr->nKey);
  1508. if( rc==LSM_OK ){
  1509. rc = sortedBlobSet(pEnv, &pCsr->val, pPtr->pVal, pPtr->nVal);
  1510. }
  1511. pCsr->flags |= CURSOR_SEEK_EQ;
  1512. }
  1513. segmentPtrReset(pPtr, LSM_SEGMENTPTR_FREE_THRESHOLD);
  1514. break;
  1515. }
  1516. case LSM_SEEK_LE:
  1517. if( res>0 ) rc = segmentPtrAdvance(pCsr, pPtr, 1);
  1518. break;
  1519. case LSM_SEEK_GE: {
  1520. /* Figure out if we need to 'skip' the pointer forward or not */
  1521. if( (res<=0 && (pPtr->eType & LSM_START_DELETE))
  1522. || (res>0 && (pPtr->eType & LSM_END_DELETE))
  1523. ){
  1524. rc = segmentPtrFwdPointer(pCsr, pPtr, &iPtrOut);
  1525. }
  1526. if( res<0 && rc==LSM_OK ){
  1527. rc = segmentPtrAdvance(pCsr, pPtr, 0);
  1528. }
  1529. break;
  1530. }
  1531. }
  1532. }
  1533. }
  1534. /* If the cursor seek has found a separator key, and this cursor is
  1535. ** supposed to ignore separators keys, advance to the next entry. */
  1536. if( rc==LSM_OK && pPtr->pPg
  1537. && segmentPtrIgnoreSeparators(pCsr, pPtr)
  1538. && rtIsSeparator(pPtr->eType)
  1539. ){
  1540. assert( eSeek!=LSM_SEEK_EQ );
  1541. rc = segmentPtrAdvance(pCsr, pPtr, eSeek==LSM_SEEK_LE);
  1542. }
  1543. }
  1544. assert( rc!=LSM_OK || assertSeekResult(pCsr,pPtr,iTopic,pKey,nKey,eSeek) );
  1545. *piPtr = iPtrOut;
  1546. return rc;
  1547. }
  1548. static int seekInBtree(
  1549. MultiCursor *pCsr, /* Multi-cursor object */
  1550. Segment *pSeg, /* Seek within this segment */
  1551. int iTopic,
  1552. void *pKey, int nKey, /* Key to seek to */
  1553. LsmPgno *aPg, /* OUT: Page numbers */
  1554. Page **ppPg /* OUT: Leaf (sorted-run) page reference */
  1555. ){
  1556. int i = 0;
  1557. int rc;
  1558. LsmPgno iPg;
  1559. Page *pPg = 0;
  1560. LsmBlob blob = {0, 0, 0};
  1561. iPg = pSeg->iRoot;
  1562. do {
  1563. LsmPgno *piFirst = 0;
  1564. if( aPg ){
  1565. aPg[i++] = iPg;
  1566. piFirst = &aPg[i];
  1567. }
  1568. rc = lsmFsDbPageGet(pCsr->pDb->pFS, pSeg, iPg, &pPg);
  1569. assert( rc==LSM_OK || pPg==0 );
  1570. if( rc==LSM_OK ){
  1571. u8 *aData; /* Buffer containing page data */
  1572. int nData; /* Size of aData[] in bytes */
  1573. int iMin;
  1574. int iMax;
  1575. int nRec;
  1576. int flags;
  1577. aData = fsPageData(pPg, &nData);
  1578. flags = pageGetFlags(aData, nData);
  1579. if( (flags & SEGMENT_BTREE_FLAG)==0 ) break;
  1580. iPg = pageGetPtr(aData, nData);
  1581. nRec = pageGetNRec(aData, nData);
  1582. iMin = 0;
  1583. iMax = nRec-1;
  1584. while( iMax>=iMin ){
  1585. int iTry = (iMin+iMax)/2;
  1586. void *pKeyT; int nKeyT; /* Key for cell iTry */
  1587. int iTopicT; /* Topic for key pKeyT/nKeyT */
  1588. LsmPgno iPtr; /* Pointer associated with cell iTry */
  1589. int res; /* (pKey - pKeyT) */
  1590. rc = pageGetBtreeKey(
  1591. pSeg, pPg, iTry, &iPtr, &iTopicT, &pKeyT, &nKeyT, &blob
  1592. );
  1593. if( rc!=LSM_OK ) break;
  1594. if( piFirst && pKeyT==blob.pData ){
  1595. *piFirst = pageGetBtreeRef(pPg, iTry);
  1596. piFirst = 0;
  1597. i++;
  1598. }
  1599. res = sortedKeyCompare(
  1600. pCsr->pDb->xCmp, iTopic, pKey, nKey, iTopicT, pKeyT, nKeyT
  1601. );
  1602. if( res<0 ){
  1603. iPg = iPtr;
  1604. iMax = iTry-1;
  1605. }else{
  1606. iMin = iTry+1;
  1607. }
  1608. }
  1609. lsmFsPageRelease(pPg);
  1610. pPg = 0;
  1611. }
  1612. }while( rc==LSM_OK );
  1613. sortedBlobFree(&blob);
  1614. assert( (rc==LSM_OK)==(pPg!=0) );
  1615. if( ppPg ){
  1616. *ppPg = pPg;
  1617. }else{
  1618. lsmFsPageRelease(pPg);
  1619. }
  1620. return rc;
  1621. }
  1622. static int seekInSegment(
  1623. MultiCursor *pCsr,
  1624. SegmentPtr *pPtr,
  1625. int iTopic,
  1626. void *pKey, int nKey,
  1627. LsmPgno iPg, /* Page to search */
  1628. int eSeek, /* Search bias - see above */
  1629. LsmPgno *piPtr, /* OUT: FC pointer */
  1630. int *pbStop /* OUT: Stop search flag */
  1631. ){
  1632. LsmPgno iPtr = iPg;
  1633. int rc = LSM_OK;
  1634. if( pPtr->pSeg->iRoot ){
  1635. Page *pPg;
  1636. assert( pPtr->pSeg->iRoot!=0 );
  1637. rc = seekInBtree(pCsr, pPtr->pSeg, iTopic, pKey, nKey, 0, &pPg);
  1638. if( rc==LSM_OK ) segmentPtrSetPage(pPtr, pPg);
  1639. }else{
  1640. if( iPtr==0 ){
  1641. iPtr = pPtr->pSeg->iFirst;
  1642. }
  1643. if( rc==LSM_OK ){
  1644. rc = segmentPtrLoadPage(pCsr->pDb->pFS, pPtr, iPtr);
  1645. }
  1646. }
  1647. if( rc==LSM_OK ){
  1648. rc = segmentPtrSeek(pCsr, pPtr, iTopic, pKey, nKey, eSeek, piPtr, pbStop);
  1649. }
  1650. return rc;
  1651. }
  1652. /*
  1653. ** Seek each segment pointer in the array of (pLvl->nRight+1) at aPtr[].
  1654. **
  1655. ** pbStop:
  1656. ** This parameter is only significant if parameter eSeek is set to
  1657. ** LSM_SEEK_EQ. In this case, it is set to true before returning if
  1658. ** the seek operation is finished. This can happen in two ways:
  1659. **
  1660. ** a) A key matching (pKey/nKey) is found, or
  1661. ** b) A point-delete or range-delete deleting the key is found.
  1662. **
  1663. ** In case (a), the multi-cursor CURSOR_SEEK_EQ flag is set and the pCsr->key
  1664. ** and pCsr->val blobs populated before returning.
  1665. */
  1666. static int seekInLevel(
  1667. MultiCursor *pCsr, /* Sorted cursor object to seek */
  1668. SegmentPtr *aPtr, /* Pointer to array of (nRhs+1) SPs */
  1669. int eSeek, /* Search bias - see above */
  1670. int iTopic, /* Key topic to search for */
  1671. void *pKey, int nKey, /* Key to search for */
  1672. LsmPgno *piPgno, /* IN/OUT: fraction cascade pointer (or 0) */
  1673. int *pbStop /* OUT: See above */
  1674. ){
  1675. Level *pLvl = aPtr[0].pLevel; /* Level to seek within */
  1676. int rc = LSM_OK; /* Return code */
  1677. LsmPgno iOut = 0; /* Pointer to return to caller */
  1678. int res = -1; /* Result of xCmp(pKey, split) */
  1679. int nRhs = pLvl->nRight; /* Number of right-hand-side segments */
  1680. int bStop = 0;
  1681. /* If this is a composite level (one currently undergoing an incremental
  1682. ** merge), figure out if the search key is larger or smaller than the
  1683. ** levels split-key. */
  1684. if( nRhs ){
  1685. res = sortedKeyCompare(pCsr->pDb->xCmp, iTopic, pKey, nKey,
  1686. pLvl->iSplitTopic, pLvl->pSplitKey, pLvl->nSplitKey
  1687. );
  1688. }
  1689. /* If (res<0), then key pKey/nKey is smaller than the split-key (or this
  1690. ** is not a composite level and there is no split-key). Search the
  1691. ** left-hand-side of the level in this case. */
  1692. if( res<0 ){
  1693. int i;
  1694. LsmPgno iPtr = 0;
  1695. if( nRhs==0 ) iPtr = *piPgno;
  1696. rc = seekInSegment(
  1697. pCsr, &aPtr[0], iTopic, pKey, nKey, iPtr, eSeek, &iOut, &bStop
  1698. );
  1699. if( rc==LSM_OK && nRhs>0 && eSeek==LSM_SEEK_GE && aPtr[0].pPg==0 ){
  1700. res = 0;
  1701. }
  1702. for(i=1; i<=nRhs; i++){
  1703. segmentPtrReset(&aPtr[i], LSM_SEGMENTPTR_FREE_THRESHOLD);
  1704. }
  1705. }
  1706. if( res>=0 ){
  1707. int bHit = 0; /* True if at least one rhs is not EOF */
  1708. LsmPgno iPtr = *piPgno;
  1709. int i;
  1710. segmentPtrReset(&aPtr[0], LSM_SEGMENTPTR_FREE_THRESHOLD);
  1711. for(i=1; rc==LSM_OK && i<=nRhs && bStop==0; i++){
  1712. SegmentPtr *pPtr = &aPtr[i];
  1713. iOut = 0;
  1714. rc = seekInSegment(
  1715. pCsr, pPtr, iTopic, pKey, nKey, iPtr, eSeek, &iOut, &bStop
  1716. );
  1717. iPtr = iOut;
  1718. /* If the segment-pointer has settled on a key that is smaller than
  1719. ** the splitkey, invalidate the segment-pointer. */
  1720. if( pPtr->pPg ){
  1721. res = sortedKeyCompare(pCsr->pDb->xCmp,
  1722. rtTopic(pPtr->eType), pPtr->pKey, pPtr->nKey,
  1723. pLvl->iSplitTopic, pLvl->pSplitKey, pLvl->nSplitKey
  1724. );
  1725. if( res<0 ){
  1726. if( pPtr->eType & LSM_START_DELETE ){
  1727. pPtr->eType &= ~LSM_INSERT;
  1728. pPtr->pKey = pLvl->pSplitKey;
  1729. pPtr->nKey = pLvl->nSplitKey;
  1730. pPtr->pVal = 0;
  1731. pPtr->nVal = 0;
  1732. }else{
  1733. segmentPtrReset(pPtr, LSM_SEGMENTPTR_FREE_THRESHOLD);
  1734. }
  1735. }
  1736. }
  1737. if( aPtr[i].pKey ) bHit = 1;
  1738. }
  1739. if( rc==LSM_OK && eSeek==LSM_SEEK_LE && bHit==0 ){
  1740. rc = segmentPtrEnd(pCsr, &aPtr[0], 1);
  1741. }
  1742. }
  1743. assert( eSeek==LSM_SEEK_EQ || bStop==0 );
  1744. *piPgno = iOut;
  1745. *pbStop = bStop;
  1746. return rc;
  1747. }
  1748. static void multiCursorGetKey(
  1749. MultiCursor *pCsr,
  1750. int iKey,
  1751. int *peType, /* OUT: Key type (SORTED_WRITE etc.) */
  1752. void **ppKey, /* OUT: Pointer to buffer containing key */
  1753. int *pnKey /* OUT: Size of *ppKey in bytes */
  1754. ){
  1755. int nKey = 0;
  1756. void *pKey = 0;
  1757. int eType = 0;
  1758. switch( iKey ){
  1759. case CURSOR_DATA_TREE0:
  1760. case CURSOR_DATA_TREE1: {
  1761. TreeCursor *pTreeCsr = pCsr->apTreeCsr[iKey-CURSOR_DATA_TREE0];
  1762. if( lsmTreeCursorValid(pTreeCsr) ){
  1763. lsmTreeCursorKey(pTreeCsr, &eType, &pKey, &nKey);
  1764. }
  1765. break;
  1766. }
  1767. case CURSOR_DATA_SYSTEM: {
  1768. Snapshot *pWorker = pCsr->pDb->pWorker;
  1769. if( pWorker && (pCsr->flags & CURSOR_FLUSH_FREELIST) ){
  1770. int nEntry = pWorker->freelist.nEntry;
  1771. if( pCsr->iFree < (nEntry*2) ){
  1772. FreelistEntry *aEntry = pWorker->freelist.aEntry;
  1773. int i = nEntry - 1 - (pCsr->iFree / 2);
  1774. u32 iKey2 = 0;
  1775. if( (pCsr->iFree % 2) ){
  1776. eType = LSM_END_DELETE|LSM_SYSTEMKEY;
  1777. iKey2 = aEntry[i].iBlk-1;
  1778. }else if( aEntry[i].iId>=0 ){
  1779. eType = LSM_INSERT|LSM_SYSTEMKEY;
  1780. iKey2 = aEntry[i].iBlk;
  1781. /* If the in-memory entry immediately before this one was a
  1782. ** DELETE, and the block number is one greater than the current
  1783. ** block number, mark this entry as an "end-delete-range". */
  1784. if( i<(nEntry-1) && aEntry[i+1].iBlk==iKey2+1 && aEntry[i+1].iId<0 ){
  1785. eType |= LSM_END_DELETE;
  1786. }
  1787. }else{
  1788. eType = LSM_START_DELETE|LSM_SYSTEMKEY;
  1789. iKey2 = aEntry[i].iBlk + 1;
  1790. }
  1791. /* If the in-memory entry immediately after this one is a
  1792. ** DELETE, and the block number is one less than the current
  1793. ** key, mark this entry as an "start-delete-range". */
  1794. if( i>0 && aEntry[i-1].iBlk==iKey2-1 && aEntry[i-1].iId<0 ){
  1795. eType |= LSM_START_DELETE;
  1796. }
  1797. pKey = pCsr->pSystemVal;
  1798. nKey = 4;
  1799. lsmPutU32(pKey, ~iKey2);
  1800. }
  1801. }
  1802. break;
  1803. }
  1804. default: {
  1805. int iPtr = iKey - CURSOR_DATA_SEGMENT;
  1806. assert( iPtr>=0 );
  1807. if( iPtr==pCsr->nPtr ){
  1808. if( pCsr->pBtCsr ){
  1809. pKey = pCsr->pBtCsr->pKey;
  1810. nKey = pCsr->pBtCsr->nKey;
  1811. eType = pCsr->pBtCsr->eType;
  1812. }
  1813. }else if( iPtr<pCsr->nPtr ){
  1814. SegmentPtr *pPtr = &pCsr->aPtr[iPtr];
  1815. if( pPtr->pPg ){
  1816. pKey = pPtr->pKey;
  1817. nKey = pPtr->nKey;
  1818. eType = pPtr->eType;
  1819. }
  1820. }
  1821. break;
  1822. }
  1823. }
  1824. if( peType ) *peType = eType;
  1825. if( pnKey ) *pnKey = nKey;
  1826. if( ppKey ) *ppKey = pKey;
  1827. }
  1828. static int sortedDbKeyCompare(
  1829. MultiCursor *pCsr,
  1830. int iLhsFlags, void *pLhsKey, int nLhsKey,
  1831. int iRhsFlags, void *pRhsKey, int nRhsKey
  1832. ){
  1833. int (*xCmp)(void *, int, void *, int) = pCsr->pDb->xCmp;
  1834. int res;
  1835. /* Compare the keys, including the system flag. */
  1836. res = sortedKeyCompare(xCmp,
  1837. rtTopic(iLhsFlags), pLhsKey, nLhsKey,
  1838. rtTopic(iRhsFlags), pRhsKey, nRhsKey
  1839. );
  1840. /* If a key has the LSM_START_DELETE flag set, but not the LSM_INSERT or
  1841. ** LSM_POINT_DELETE flags, it is considered a delta larger. This prevents
  1842. ** the beginning of an open-ended set from masking a database entry or
  1843. ** delete at a lower level. */
  1844. if( res==0 && (pCsr->flags & CURSOR_IGNORE_DELETE) ){
  1845. const int m = LSM_POINT_DELETE|LSM_INSERT|LSM_END_DELETE |LSM_START_DELETE;
  1846. int iDel1 = 0;
  1847. int iDel2 = 0;
  1848. if( LSM_START_DELETE==(iLhsFlags & m) ) iDel1 = +1;
  1849. if( LSM_END_DELETE ==(iLhsFlags & m) ) iDel1 = -1;
  1850. if( LSM_START_DELETE==(iRhsFlags & m) ) iDel2 = +1;
  1851. if( LSM_END_DELETE ==(iRhsFlags & m) ) iDel2 = -1;
  1852. res = (iDel1 - iDel2);
  1853. }
  1854. return res;
  1855. }
  1856. static void multiCursorDoCompare(MultiCursor *pCsr, int iOut, int bReverse){
  1857. int i1;
  1858. int i2;
  1859. int iRes;
  1860. void *pKey1; int nKey1; int eType1;
  1861. void *pKey2; int nKey2; int eType2;
  1862. const int mul = (bReverse ? -1 : 1);
  1863. assert( pCsr->aTree && iOut<pCsr->nTree );
  1864. if( iOut>=(pCsr->nTree/2) ){
  1865. i1 = (iOut - pCsr->nTree/2) * 2;
  1866. i2 = i1 + 1;
  1867. }else{
  1868. i1 = pCsr->aTree[iOut*2];
  1869. i2 = pCsr->aTree[iOut*2+1];
  1870. }
  1871. multiCursorGetKey(pCsr, i1, &eType1, &pKey1, &nKey1);
  1872. multiCursorGetKey(pCsr, i2, &eType2, &pKey2, &nKey2);
  1873. if( pKey1==0 ){
  1874. iRes = i2;
  1875. }else if( pKey2==0 ){
  1876. iRes = i1;
  1877. }else{
  1878. int res;
  1879. /* Compare the keys */
  1880. res = sortedDbKeyCompare(pCsr,
  1881. eType1, pKey1, nKey1, eType2, pKey2, nKey2
  1882. );
  1883. res = res * mul;
  1884. if( res==0 ){
  1885. /* The two keys are identical. Normally, this means that the key from
  1886. ** the newer run clobbers the old. However, if the newer key is a
  1887. ** separator key, or a range-delete-boundary only, do not allow it
  1888. ** to clobber an older entry. */
  1889. int nc1 = (eType1 & (LSM_INSERT|LSM_POINT_DELETE))==0;
  1890. int nc2 = (eType2 & (LSM_INSERT|LSM_POINT_DELETE))==0;
  1891. iRes = (nc1 > nc2) ? i2 : i1;
  1892. }else if( res<0 ){
  1893. iRes = i1;
  1894. }else{
  1895. iRes = i2;
  1896. }
  1897. }
  1898. pCsr->aTree[iOut] = iRes;
  1899. }
  1900. /*
  1901. ** This function advances segment pointer iPtr belonging to multi-cursor
  1902. ** pCsr forward (bReverse==0) or backward (bReverse!=0).
  1903. **
  1904. ** If the segment pointer points to a segment that is part of a composite
  1905. ** level, then the following special case is handled.
  1906. **
  1907. ** * If iPtr is the lhs of a composite level, and the cursor is being
  1908. ** advanced forwards, and segment iPtr is at EOF, move all pointers
  1909. ** that correspond to rhs segments of the same level to the first
  1910. ** key in their respective data.
  1911. */
  1912. static int segmentCursorAdvance(
  1913. MultiCursor *pCsr,
  1914. int iPtr,
  1915. int bReverse
  1916. ){
  1917. int rc;
  1918. SegmentPtr *pPtr = &pCsr->aPtr[iPtr];
  1919. Level *pLvl = pPtr->pLevel;
  1920. int bComposite; /* True if pPtr is part of composite level */
  1921. /* Advance the segment-pointer object. */
  1922. rc = segmentPtrAdvance(pCsr, pPtr, bReverse);
  1923. if( rc!=LSM_OK ) return rc;
  1924. bComposite = (pLvl->nRight>0 && pCsr->nPtr>pLvl->nRight);
  1925. if( bComposite && pPtr->pPg==0 ){
  1926. int bFix = 0;
  1927. if( (bReverse==0)==(pPtr->pSeg==&pLvl->lhs) ){
  1928. int i;
  1929. if( bReverse ){
  1930. SegmentPtr *pLhs = &pCsr->aPtr[iPtr - 1 - (pPtr->pSeg - pLvl->aRhs)];
  1931. for(i=0; i<pLvl->nRight; i++){
  1932. if( pLhs[i+1].pPg ) break;
  1933. }
  1934. if( i==pLvl->nRight ){
  1935. bFix = 1;
  1936. rc = segmentPtrEnd(pCsr, pLhs, 1);
  1937. }
  1938. }else{
  1939. bFix = 1;
  1940. for(i=0; rc==LSM_OK && i<pLvl->nRight; i++){
  1941. rc = sortedRhsFirst(pCsr, pLvl, &pCsr->aPtr[iPtr+1+i]);
  1942. }
  1943. }
  1944. }
  1945. if( bFix ){
  1946. int i;
  1947. for(i=pCsr->nTree-1; i>0; i--){
  1948. multiCursorDoCompare(pCsr, i, bReverse);
  1949. }
  1950. }
  1951. }
  1952. #if 0
  1953. if( bComposite && pPtr->pSeg==&pLvl->lhs /* lhs of composite level */
  1954. && bReverse==0 /* csr advanced forwards */
  1955. && pPtr->pPg==0 /* segment at EOF */
  1956. ){
  1957. int i;
  1958. for(i=0; rc==LSM_OK && i<pLvl->nRight; i++){
  1959. rc = sortedRhsFirst(pCsr, pLvl, &pCsr->aPtr[iPtr+1+i]);
  1960. }
  1961. for(i=pCsr->nTree-1; i>0; i--){
  1962. multiCursorDoCompare(pCsr, i, 0);
  1963. }
  1964. }
  1965. #endif
  1966. return rc;
  1967. }
  1968. static void mcursorFreeComponents(MultiCursor *pCsr){
  1969. int i;
  1970. lsm_env *pEnv = pCsr->pDb->pEnv;
  1971. /* Close the tree cursor, if any. */
  1972. lsmTreeCursorDestroy(pCsr->apTreeCsr[0]);
  1973. lsmTreeCursorDestroy(pCsr->apTreeCsr[1]);
  1974. /* Reset the segment pointers */
  1975. for(i=0; i<pCsr->nPtr; i++){
  1976. segmentPtrReset(&pCsr->aPtr[i], 0);
  1977. }
  1978. /* And the b-tree cursor, if any */
  1979. btreeCursorFree(pCsr->pBtCsr);
  1980. /* Free allocations */
  1981. lsmFree(pEnv, pCsr->aPtr);
  1982. lsmFree(pEnv, pCsr->aTree);
  1983. lsmFree(pEnv, pCsr->pSystemVal);
  1984. /* Zero fields */
  1985. pCsr->nPtr = 0;
  1986. pCsr->aPtr = 0;
  1987. pCsr->nTree = 0;
  1988. pCsr->aTree = 0;
  1989. pCsr->pSystemVal = 0;
  1990. pCsr->apTreeCsr[0] = 0;
  1991. pCsr->apTreeCsr[1] = 0;
  1992. pCsr->pBtCsr = 0;
  1993. }
  1994. void lsmMCursorFreeCache(lsm_db *pDb){
  1995. MultiCursor *p;
  1996. MultiCursor *pNext;
  1997. for(p=pDb->pCsrCache; p; p=pNext){
  1998. pNext = p->pNext;
  1999. lsmMCursorClose(p, 0);
  2000. }
  2001. pDb->pCsrCache = 0;
  2002. }
  2003. /*
  2004. ** Close the cursor passed as the first argument.
  2005. **
  2006. ** If the bCache parameter is true, then shift the cursor to the pCsrCache
  2007. ** list for possible reuse instead of actually deleting it.
  2008. */
  2009. void lsmMCursorClose(MultiCursor *pCsr, int bCache){
  2010. if( pCsr ){
  2011. lsm_db *pDb = pCsr->pDb;
  2012. MultiCursor **pp; /* Iterator variable */
  2013. /* The cursor may or may not be currently part of the linked list
  2014. ** starting at lsm_db.pCsr. If it is, extract it. */
  2015. for(pp=&pDb->pCsr; *pp; pp=&((*pp)->pNext)){
  2016. if( *pp==pCsr ){
  2017. *pp = pCsr->pNext;
  2018. break;
  2019. }
  2020. }
  2021. if( bCache ){
  2022. int i; /* Used to iterate through segment-pointers */
  2023. /* Release any page references held by this cursor. */
  2024. assert( !pCsr->pBtCsr );
  2025. for(i=0; i<pCsr->nPtr; i++){
  2026. SegmentPtr *pPtr = &pCsr->aPtr[i];
  2027. lsmFsPageRelease(pPtr->pPg);
  2028. pPtr->pPg = 0;
  2029. }
  2030. /* Reset the tree cursors */
  2031. lsmTreeCursorReset(pCsr->apTreeCsr[0]);
  2032. lsmTreeCursorReset(pCsr->apTreeCsr[1]);
  2033. /* Add the cursor to the pCsrCache list */
  2034. pCsr->pNext = pDb->pCsrCache;
  2035. pDb->pCsrCache = pCsr;
  2036. }else{
  2037. /* Free the allocation used to cache the current key, if any. */
  2038. sortedBlobFree(&pCsr->key);
  2039. sortedBlobFree(&pCsr->val);
  2040. /* Free the component cursors */
  2041. mcursorFreeComponents(pCsr);
  2042. /* Free the cursor structure itself */
  2043. lsmFree(pDb->pEnv, pCsr);
  2044. }
  2045. }
  2046. }
  2047. #define TREE_NONE 0
  2048. #define TREE_OLD 1
  2049. #define TREE_BOTH 2
  2050. /*
  2051. ** Parameter eTree is one of TREE_OLD or TREE_BOTH.
  2052. */
  2053. static int multiCursorAddTree(MultiCursor *pCsr, Snapshot *pSnap, int eTree){
  2054. int rc = LSM_OK;
  2055. lsm_db *db = pCsr->pDb;
  2056. /* Add a tree cursor on the 'old' tree, if it exists. */
  2057. if( eTree!=TREE_NONE
  2058. && lsmTreeHasOld(db)
  2059. && db->treehdr.iOldLog!=pSnap->iLogOff
  2060. ){
  2061. rc = lsmTreeCursorNew(db, 1, &pCsr->apTreeCsr[1]);
  2062. }
  2063. /* Add a tree cursor on the 'current' tree, if required. */
  2064. if( rc==LSM_OK && eTree==TREE_BOTH ){
  2065. rc = lsmTreeCursorNew(db, 0, &pCsr->apTreeCsr[0]);
  2066. }
  2067. return rc;
  2068. }
  2069. static int multiCursorAddRhs(MultiCursor *pCsr, Level *pLvl){
  2070. int i;
  2071. int nRhs = pLvl->nRight;
  2072. assert( pLvl->nRight>0 );
  2073. assert( pCsr->aPtr==0 );
  2074. pCsr->aPtr = lsmMallocZero(pCsr->pDb->pEnv, sizeof(SegmentPtr) * nRhs);
  2075. if( !pCsr->aPtr ) return LSM_NOMEM_BKPT;
  2076. pCsr->nPtr = nRhs;
  2077. for(i=0; i<nRhs; i++){
  2078. pCsr->aPtr[i].pSeg = &pLvl->aRhs[i];
  2079. pCsr->aPtr[i].pLevel = pLvl;
  2080. }
  2081. return LSM_OK;
  2082. }
  2083. static void multiCursorAddOne(MultiCursor *pCsr, Level *pLvl, int *pRc){
  2084. if( *pRc==LSM_OK ){
  2085. int iPtr = pCsr->nPtr;
  2086. int i;
  2087. pCsr->aPtr[iPtr].pLevel = pLvl;
  2088. pCsr->aPtr[iPtr].pSeg = &pLvl->lhs;
  2089. iPtr++;
  2090. for(i=0; i<pLvl->nRight; i++){
  2091. pCsr->aPtr[iPtr].pLevel = pLvl;
  2092. pCsr->aPtr[iPtr].pSeg = &pLvl->aRhs[i];
  2093. iPtr++;
  2094. }
  2095. if( pLvl->nRight && pLvl->pSplitKey==0 ){
  2096. sortedSplitkey(pCsr->pDb, pLvl, pRc);
  2097. }
  2098. pCsr->nPtr = iPtr;
  2099. }
  2100. }
  2101. static int multiCursorAddAll(MultiCursor *pCsr, Snapshot *pSnap){
  2102. Level *pLvl;
  2103. int nPtr = 0;
  2104. int rc = LSM_OK;
  2105. for(pLvl=pSnap->pLevel; pLvl; pLvl=pLvl->pNext){
  2106. /* If the LEVEL_INCOMPLETE flag is set, then this function is being
  2107. ** called (indirectly) from within a sortedNewToplevel() call to
  2108. ** construct pLvl. In this case ignore pLvl - this cursor is going to
  2109. ** be used to retrieve a freelist entry from the LSM, and the partially
  2110. ** complete level may confuse it. */
  2111. if( pLvl->flags & LEVEL_INCOMPLETE ) continue;
  2112. nPtr += (1 + pLvl->nRight);
  2113. }
  2114. assert( pCsr->aPtr==0 );
  2115. pCsr->aPtr = lsmMallocZeroRc(pCsr->pDb->pEnv, sizeof(SegmentPtr) * nPtr, &rc);
  2116. for(pLvl=pSnap->pLevel; pLvl; pLvl=pLvl->pNext){
  2117. if( (pLvl->flags & LEVEL_INCOMPLETE)==0 ){
  2118. multiCursorAddOne(pCsr, pLvl, &rc);
  2119. }
  2120. }
  2121. return rc;
  2122. }
  2123. static int multiCursorInit(MultiCursor *pCsr, Snapshot *pSnap){
  2124. int rc;
  2125. rc = multiCursorAddAll(pCsr, pSnap);
  2126. if( rc==LSM_OK ){
  2127. rc = multiCursorAddTree(pCsr, pSnap, TREE_BOTH);
  2128. }
  2129. pCsr->flags |= (CURSOR_IGNORE_SYSTEM | CURSOR_IGNORE_DELETE);
  2130. return rc;
  2131. }
  2132. static MultiCursor *multiCursorNew(lsm_db *db, int *pRc){
  2133. MultiCursor *pCsr;
  2134. pCsr = (MultiCursor *)lsmMallocZeroRc(db->pEnv, sizeof(MultiCursor), pRc);
  2135. if( pCsr ){
  2136. pCsr->pNext = db->pCsr;
  2137. db->pCsr = pCsr;
  2138. pCsr->pDb = db;
  2139. }
  2140. return pCsr;
  2141. }
  2142. void lsmSortedRemap(lsm_db *pDb){
  2143. MultiCursor *pCsr;
  2144. for(pCsr=pDb->pCsr; pCsr; pCsr=pCsr->pNext){
  2145. int iPtr;
  2146. if( pCsr->pBtCsr ){
  2147. btreeCursorLoadKey(pCsr->pBtCsr);
  2148. }
  2149. for(iPtr=0; iPtr<pCsr->nPtr; iPtr++){
  2150. segmentPtrLoadCell(&pCsr->aPtr[iPtr], pCsr->aPtr[iPtr].iCell);
  2151. }
  2152. }
  2153. }
  2154. static void multiCursorReadSeparators(MultiCursor *pCsr){
  2155. if( pCsr->nPtr>0 ){
  2156. pCsr->flags |= CURSOR_READ_SEPARATORS;
  2157. }
  2158. }
  2159. /*
  2160. ** Have this cursor skip over SORTED_DELETE entries.
  2161. */
  2162. static void multiCursorIgnoreDelete(MultiCursor *pCsr){
  2163. if( pCsr ) pCsr->flags |= CURSOR_IGNORE_DELETE;
  2164. }
  2165. /*
  2166. ** If the free-block list is not empty, then have this cursor visit a key
  2167. ** with (a) the system bit set, and (b) the key "FREELIST" and (c) a value
  2168. ** blob containing the serialized free-block list.
  2169. */
  2170. static int multiCursorVisitFreelist(MultiCursor *pCsr){
  2171. int rc = LSM_OK;
  2172. pCsr->flags |= CURSOR_FLUSH_FREELIST;
  2173. pCsr->pSystemVal = lsmMallocRc(pCsr->pDb->pEnv, 4 + 8, &rc);
  2174. return rc;
  2175. }
  2176. /*
  2177. ** Allocate and return a new database cursor.
  2178. **
  2179. ** This method should only be called to allocate user cursors. As it may
  2180. ** recycle a cursor from lsm_db.pCsrCache.
  2181. */
  2182. int lsmMCursorNew(
  2183. lsm_db *pDb, /* Database handle */
  2184. MultiCursor **ppCsr /* OUT: Allocated cursor */
  2185. ){
  2186. MultiCursor *pCsr = 0;
  2187. int rc = LSM_OK;
  2188. if( pDb->pCsrCache ){
  2189. int bOld; /* True if there is an old in-memory tree */
  2190. /* Remove a cursor from the pCsrCache list and add it to the open list. */
  2191. pCsr = pDb->pCsrCache;
  2192. pDb->pCsrCache = pCsr->pNext;
  2193. pCsr->pNext = pDb->pCsr;
  2194. pDb->pCsr = pCsr;
  2195. /* The cursor can almost be used as is, except that the old in-memory
  2196. ** tree cursor may be present and not required, or required and not
  2197. ** present. Fix this if required. */
  2198. bOld = (lsmTreeHasOld(pDb) && pDb->treehdr.iOldLog!=pDb->pClient->iLogOff);
  2199. if( !bOld && pCsr->apTreeCsr[1] ){
  2200. lsmTreeCursorDestroy(pCsr->apTreeCsr[1]);
  2201. pCsr->apTreeCsr[1] = 0;
  2202. }else if( bOld && !pCsr->apTreeCsr[1] ){
  2203. rc = lsmTreeCursorNew(pDb, 1, &pCsr->apTreeCsr[1]);
  2204. }
  2205. pCsr->flags = (CURSOR_IGNORE_SYSTEM | CURSOR_IGNORE_DELETE);
  2206. }else{
  2207. pCsr = multiCursorNew(pDb, &rc);
  2208. if( rc==LSM_OK ) rc = multiCursorInit(pCsr, pDb->pClient);
  2209. }
  2210. if( rc!=LSM_OK ){
  2211. lsmMCursorClose(pCsr, 0);
  2212. pCsr = 0;
  2213. }
  2214. assert( (rc==LSM_OK)==(pCsr!=0) );
  2215. *ppCsr = pCsr;
  2216. return rc;
  2217. }
  2218. static int multiCursorGetVal(
  2219. MultiCursor *pCsr,
  2220. int iVal,
  2221. void **ppVal,
  2222. int *pnVal
  2223. ){
  2224. int rc = LSM_OK;
  2225. *ppVal = 0;
  2226. *pnVal = 0;
  2227. switch( iVal ){
  2228. case CURSOR_DATA_TREE0:
  2229. case CURSOR_DATA_TREE1: {
  2230. TreeCursor *pTreeCsr = pCsr->apTreeCsr[iVal-CURSOR_DATA_TREE0];
  2231. if( lsmTreeCursorValid(pTreeCsr) ){
  2232. lsmTreeCursorValue(pTreeCsr, ppVal, pnVal);
  2233. }else{
  2234. *ppVal = 0;
  2235. *pnVal = 0;
  2236. }
  2237. break;
  2238. }
  2239. case CURSOR_DATA_SYSTEM: {
  2240. Snapshot *pWorker = pCsr->pDb->pWorker;
  2241. if( pWorker
  2242. && (pCsr->iFree % 2)==0
  2243. && pCsr->iFree < (pWorker->freelist.nEntry*2)
  2244. ){
  2245. int iEntry = pWorker->freelist.nEntry - 1 - (pCsr->iFree / 2);
  2246. u8 *aVal = &((u8 *)(pCsr->pSystemVal))[4];
  2247. lsmPutU64(aVal, pWorker->freelist.aEntry[iEntry].iId);
  2248. *ppVal = aVal;
  2249. *pnVal = 8;
  2250. }
  2251. break;
  2252. }
  2253. default: {
  2254. int iPtr = iVal-CURSOR_DATA_SEGMENT;
  2255. if( iPtr<pCsr->nPtr ){
  2256. SegmentPtr *pPtr = &pCsr->aPtr[iPtr];
  2257. if( pPtr->pPg ){
  2258. *ppVal = pPtr->pVal;
  2259. *pnVal = pPtr->nVal;
  2260. }
  2261. }
  2262. }
  2263. }
  2264. assert( rc==LSM_OK || (*ppVal==0 && *pnVal==0) );
  2265. return rc;
  2266. }
  2267. static int multiCursorAdvance(MultiCursor *pCsr, int bReverse);
  2268. /*
  2269. ** This function is called by worker connections to walk the part of the
  2270. ** free-list stored within the LSM data structure.
  2271. */
  2272. int lsmSortedWalkFreelist(
  2273. lsm_db *pDb, /* Database handle */
  2274. int bReverse, /* True to iterate from largest to smallest */
  2275. int (*x)(void *, int, i64), /* Callback function */
  2276. void *pCtx /* First argument to pass to callback */
  2277. ){
  2278. MultiCursor *pCsr; /* Cursor used to read db */
  2279. int rc = LSM_OK; /* Return Code */
  2280. Snapshot *pSnap = 0;
  2281. assert( pDb->pWorker );
  2282. if( pDb->bIncrMerge ){
  2283. rc = lsmCheckpointDeserialize(pDb, 0, pDb->pShmhdr->aSnap1, &pSnap);
  2284. if( rc!=LSM_OK ) return rc;
  2285. }else{
  2286. pSnap = pDb->pWorker;
  2287. }
  2288. pCsr = multiCursorNew(pDb, &rc);
  2289. if( pCsr ){
  2290. rc = multiCursorAddAll(pCsr, pSnap);
  2291. pCsr->flags |= CURSOR_IGNORE_DELETE;
  2292. }
  2293. if( rc==LSM_OK ){
  2294. if( bReverse==0 ){
  2295. rc = lsmMCursorLast(pCsr);
  2296. }else{
  2297. rc = lsmMCursorSeek(pCsr, 1, "", 0, LSM_SEEK_GE);
  2298. }
  2299. while( rc==LSM_OK && lsmMCursorValid(pCsr) && rtIsSystem(pCsr->eType) ){
  2300. void *pKey; int nKey;
  2301. void *pVal = 0; int nVal = 0;
  2302. rc = lsmMCursorKey(pCsr, &pKey, &nKey);
  2303. if( rc==LSM_OK ) rc = lsmMCursorValue(pCsr, &pVal, &nVal);
  2304. if( rc==LSM_OK && (nKey!=4 || nVal!=8) ) rc = LSM_CORRUPT_BKPT;
  2305. if( rc==LSM_OK ){
  2306. int iBlk;
  2307. i64 iSnap;
  2308. iBlk = (int)(~(lsmGetU32((u8 *)pKey)));
  2309. iSnap = (i64)lsmGetU64((u8 *)pVal);
  2310. if( x(pCtx, iBlk, iSnap) ) break;
  2311. rc = multiCursorAdvance(pCsr, !bReverse);
  2312. }
  2313. }
  2314. }
  2315. lsmMCursorClose(pCsr, 0);
  2316. if( pSnap!=pDb->pWorker ){
  2317. lsmFreeSnapshot(pDb->pEnv, pSnap);
  2318. }
  2319. return rc;
  2320. }
  2321. int lsmSortedLoadFreelist(
  2322. lsm_db *pDb, /* Database handle (must be worker) */
  2323. void **ppVal, /* OUT: Blob containing LSM free-list */
  2324. int *pnVal /* OUT: Size of *ppVal blob in bytes */
  2325. ){
  2326. MultiCursor *pCsr; /* Cursor used to retreive free-list */
  2327. int rc = LSM_OK; /* Return Code */
  2328. assert( pDb->pWorker );
  2329. assert( *ppVal==0 && *pnVal==0 );
  2330. pCsr = multiCursorNew(pDb, &rc);
  2331. if( pCsr ){
  2332. rc = multiCursorAddAll(pCsr, pDb->pWorker);
  2333. pCsr->flags |= CURSOR_IGNORE_DELETE;
  2334. }
  2335. if( rc==LSM_OK ){
  2336. rc = lsmMCursorLast(pCsr);
  2337. if( rc==LSM_OK
  2338. && rtIsWrite(pCsr->eType) && rtIsSystem(pCsr->eType)
  2339. && pCsr->key.nData==8
  2340. && 0==memcmp(pCsr->key.pData, "FREELIST", 8)
  2341. ){
  2342. void *pVal; int nVal; /* Value read from database */
  2343. rc = lsmMCursorValue(pCsr, &pVal, &nVal);
  2344. if( rc==LSM_OK ){
  2345. *ppVal = lsmMallocRc(pDb->pEnv, nVal, &rc);
  2346. if( *ppVal ){
  2347. memcpy(*ppVal, pVal, nVal);
  2348. *pnVal = nVal;
  2349. }
  2350. }
  2351. }
  2352. lsmMCursorClose(pCsr, 0);
  2353. }
  2354. return rc;
  2355. }
  2356. static int multiCursorAllocTree(MultiCursor *pCsr){
  2357. int rc = LSM_OK;
  2358. if( pCsr->aTree==0 ){
  2359. int nByte; /* Bytes of space to allocate */
  2360. int nMin; /* Total number of cursors being merged */
  2361. nMin = CURSOR_DATA_SEGMENT + pCsr->nPtr + (pCsr->pBtCsr!=0);
  2362. pCsr->nTree = 2;
  2363. while( pCsr->nTree<nMin ){
  2364. pCsr->nTree = pCsr->nTree*2;
  2365. }
  2366. nByte = sizeof(int)*pCsr->nTree*2;
  2367. pCsr->aTree = (int *)lsmMallocZeroRc(pCsr->pDb->pEnv, nByte, &rc);
  2368. }
  2369. return rc;
  2370. }
  2371. static void multiCursorCacheKey(MultiCursor *pCsr, int *pRc){
  2372. if( *pRc==LSM_OK ){
  2373. void *pKey;
  2374. int nKey;
  2375. multiCursorGetKey(pCsr, pCsr->aTree[1], &pCsr->eType, &pKey, &nKey);
  2376. *pRc = sortedBlobSet(pCsr->pDb->pEnv, &pCsr->key, pKey, nKey);
  2377. }
  2378. }
  2379. #ifdef LSM_DEBUG_EXPENSIVE
  2380. static void assertCursorTree(MultiCursor *pCsr){
  2381. int bRev = !!(pCsr->flags & CURSOR_PREV_OK);
  2382. int *aSave = pCsr->aTree;
  2383. int nSave = pCsr->nTree;
  2384. int rc;
  2385. pCsr->aTree = 0;
  2386. pCsr->nTree = 0;
  2387. rc = multiCursorAllocTree(pCsr);
  2388. if( rc==LSM_OK ){
  2389. int i;
  2390. for(i=pCsr->nTree-1; i>0; i--){
  2391. multiCursorDoCompare(pCsr, i, bRev);
  2392. }
  2393. assert( nSave==pCsr->nTree
  2394. && 0==memcmp(aSave, pCsr->aTree, sizeof(int)*nSave)
  2395. );
  2396. lsmFree(pCsr->pDb->pEnv, pCsr->aTree);
  2397. }
  2398. pCsr->aTree = aSave;
  2399. pCsr->nTree = nSave;
  2400. }
  2401. #else
  2402. # define assertCursorTree(x)
  2403. #endif
  2404. static int mcursorLocationOk(MultiCursor *pCsr, int bDeleteOk){
  2405. int eType = pCsr->eType;
  2406. int iKey;
  2407. int i;
  2408. int rdmask;
  2409. assert( pCsr->flags & (CURSOR_NEXT_OK|CURSOR_PREV_OK) );
  2410. assertCursorTree(pCsr);
  2411. rdmask = (pCsr->flags & CURSOR_NEXT_OK) ? LSM_END_DELETE : LSM_START_DELETE;
  2412. /* If the cursor does not currently point to an actual database key (i.e.
  2413. ** it points to a delete key, or the start or end of a range-delete), and
  2414. ** the CURSOR_IGNORE_DELETE flag is set, skip past this entry. */
  2415. if( (pCsr->flags & CURSOR_IGNORE_DELETE) && bDeleteOk==0 ){
  2416. if( (eType & LSM_INSERT)==0 ) return 0;
  2417. }
  2418. /* If the cursor points to a system key (free-list entry), and the
  2419. ** CURSOR_IGNORE_SYSTEM flag is set, skip thie entry. */
  2420. if( (pCsr->flags & CURSOR_IGNORE_SYSTEM) && rtTopic(eType)!=0 ){
  2421. return 0;
  2422. }
  2423. #ifndef NDEBUG
  2424. /* This block fires assert() statements to check one of the assumptions
  2425. ** in the comment below - that if the lhs sub-cursor of a level undergoing
  2426. ** a merge is valid, then all the rhs sub-cursors must be at EOF.
  2427. **
  2428. ** Also assert that all rhs sub-cursors are either at EOF or point to
  2429. ** a key that is not less than the level split-key. */
  2430. for(i=0; i<pCsr->nPtr; i++){
  2431. SegmentPtr *pPtr = &pCsr->aPtr[i];
  2432. Level *pLvl = pPtr->pLevel;
  2433. if( pLvl->nRight && pPtr->pPg ){
  2434. if( pPtr->pSeg==&pLvl->lhs ){
  2435. int j;
  2436. for(j=0; j<pLvl->nRight; j++) assert( pPtr[j+1].pPg==0 );
  2437. }else{
  2438. int res = sortedKeyCompare(pCsr->pDb->xCmp,
  2439. rtTopic(pPtr->eType), pPtr->pKey, pPtr->nKey,
  2440. pLvl->iSplitTopic, pLvl->pSplitKey, pLvl->nSplitKey
  2441. );
  2442. assert( res>=0 );
  2443. }
  2444. }
  2445. }
  2446. #endif
  2447. /* Now check if this key has already been deleted by a range-delete. If
  2448. ** so, skip past it.
  2449. **
  2450. ** Assume, for the moment, that the tree contains no levels currently
  2451. ** undergoing incremental merge, and that this cursor is iterating forwards
  2452. ** through the database keys. The cursor currently points to a key in
  2453. ** level L. This key has already been deleted if any of the sub-cursors
  2454. ** that point to levels newer than L (or to the in-memory tree) point to
  2455. ** a key greater than the current key with the LSM_END_DELETE flag set.
  2456. **
  2457. ** Or, if the cursor is iterating backwards through data keys, if any
  2458. ** such sub-cursor points to a key smaller than the current key with the
  2459. ** LSM_START_DELETE flag set.
  2460. **
  2461. ** Why it works with levels undergoing a merge too:
  2462. **
  2463. ** When a cursor iterates forwards, the sub-cursors for the rhs of a
  2464. ** level are only activated once the lhs reaches EOF. So when iterating
  2465. ** forwards, the keys visited are the same as if the level was completely
  2466. ** merged.
  2467. **
  2468. ** If the cursor is iterating backwards, then the lhs sub-cursor is not
  2469. ** initialized until the last of the rhs sub-cursors has reached EOF.
  2470. ** Additionally, if the START_DELETE flag is set on the last entry (in
  2471. ** reverse order - so the entry with the smallest key) of a rhs sub-cursor,
  2472. ** then a pseudo-key equal to the levels split-key with the END_DELETE
  2473. ** flag set is visited by the sub-cursor.
  2474. */
  2475. iKey = pCsr->aTree[1];
  2476. for(i=0; i<iKey; i++){
  2477. int csrflags;
  2478. multiCursorGetKey(pCsr, i, &csrflags, 0, 0);
  2479. if( (rdmask & csrflags) ){
  2480. const int SD_ED = (LSM_START_DELETE|LSM_END_DELETE);
  2481. if( (csrflags & SD_ED)==SD_ED
  2482. || (pCsr->flags & CURSOR_IGNORE_DELETE)==0
  2483. ){
  2484. void *pKey; int nKey;
  2485. multiCursorGetKey(pCsr, i, 0, &pKey, &nKey);
  2486. if( 0==sortedKeyCompare(pCsr->pDb->xCmp,
  2487. rtTopic(eType), pCsr->key.pData, pCsr->key.nData,
  2488. rtTopic(csrflags), pKey, nKey
  2489. )){
  2490. continue;
  2491. }
  2492. }
  2493. return 0;
  2494. }
  2495. }
  2496. /* The current cursor position is one this cursor should visit. Return 1. */
  2497. return 1;
  2498. }
  2499. static int multiCursorSetupTree(MultiCursor *pCsr, int bRev){
  2500. int rc;
  2501. rc = multiCursorAllocTree(pCsr);
  2502. if( rc==LSM_OK ){
  2503. int i;
  2504. for(i=pCsr->nTree-1; i>0; i--){
  2505. multiCursorDoCompare(pCsr, i, bRev);
  2506. }
  2507. }
  2508. assertCursorTree(pCsr);
  2509. multiCursorCacheKey(pCsr, &rc);
  2510. if( rc==LSM_OK && mcursorLocationOk(pCsr, 0)==0 ){
  2511. rc = multiCursorAdvance(pCsr, bRev);
  2512. }
  2513. return rc;
  2514. }
  2515. static int multiCursorEnd(MultiCursor *pCsr, int bLast){
  2516. int rc = LSM_OK;
  2517. int i;
  2518. pCsr->flags &= ~(CURSOR_NEXT_OK | CURSOR_PREV_OK | CURSOR_SEEK_EQ);
  2519. pCsr->flags |= (bLast ? CURSOR_PREV_OK : CURSOR_NEXT_OK);
  2520. pCsr->iFree = 0;
  2521. /* Position the two in-memory tree cursors */
  2522. for(i=0; rc==LSM_OK && i<2; i++){
  2523. if( pCsr->apTreeCsr[i] ){
  2524. rc = lsmTreeCursorEnd(pCsr->apTreeCsr[i], bLast);
  2525. }
  2526. }
  2527. for(i=0; rc==LSM_OK && i<pCsr->nPtr; i++){
  2528. SegmentPtr *pPtr = &pCsr->aPtr[i];
  2529. Level *pLvl = pPtr->pLevel;
  2530. int iRhs;
  2531. int bHit = 0;
  2532. if( bLast ){
  2533. for(iRhs=0; iRhs<pLvl->nRight && rc==LSM_OK; iRhs++){
  2534. rc = segmentPtrEnd(pCsr, &pPtr[iRhs+1], 1);
  2535. if( pPtr[iRhs+1].pPg ) bHit = 1;
  2536. }
  2537. if( bHit==0 && rc==LSM_OK ){
  2538. rc = segmentPtrEnd(pCsr, pPtr, 1);
  2539. }else{
  2540. segmentPtrReset(pPtr, LSM_SEGMENTPTR_FREE_THRESHOLD);
  2541. }
  2542. }else{
  2543. int bLhs = (pPtr->pSeg==&pLvl->lhs);
  2544. assert( pPtr->pSeg==&pLvl->lhs || pPtr->pSeg==&pLvl->aRhs[0] );
  2545. if( bLhs ){
  2546. rc = segmentPtrEnd(pCsr, pPtr, 0);
  2547. if( pPtr->pKey ) bHit = 1;
  2548. }
  2549. for(iRhs=0; iRhs<pLvl->nRight && rc==LSM_OK; iRhs++){
  2550. if( bHit ){
  2551. segmentPtrReset(&pPtr[iRhs+1], LSM_SEGMENTPTR_FREE_THRESHOLD);
  2552. }else{
  2553. rc = sortedRhsFirst(pCsr, pLvl, &pPtr[iRhs+bLhs]);
  2554. }
  2555. }
  2556. }
  2557. i += pLvl->nRight;
  2558. }
  2559. /* And the b-tree cursor, if applicable */
  2560. if( rc==LSM_OK && pCsr->pBtCsr ){
  2561. assert( bLast==0 );
  2562. rc = btreeCursorFirst(pCsr->pBtCsr);
  2563. }
  2564. if( rc==LSM_OK ){
  2565. rc = multiCursorSetupTree(pCsr, bLast);
  2566. }
  2567. return rc;
  2568. }
  2569. int mcursorSave(MultiCursor *pCsr){
  2570. int rc = LSM_OK;
  2571. if( pCsr->aTree ){
  2572. int iTree = pCsr->aTree[1];
  2573. if( iTree==CURSOR_DATA_TREE0 || iTree==CURSOR_DATA_TREE1 ){
  2574. multiCursorCacheKey(pCsr, &rc);
  2575. }
  2576. }
  2577. mcursorFreeComponents(pCsr);
  2578. return rc;
  2579. }
  2580. int mcursorRestore(lsm_db *pDb, MultiCursor *pCsr){
  2581. int rc;
  2582. rc = multiCursorInit(pCsr, pDb->pClient);
  2583. if( rc==LSM_OK && pCsr->key.pData ){
  2584. rc = lsmMCursorSeek(pCsr,
  2585. rtTopic(pCsr->eType), pCsr->key.pData, pCsr->key.nData, +1
  2586. );
  2587. }
  2588. return rc;
  2589. }
  2590. int lsmSaveCursors(lsm_db *pDb){
  2591. int rc = LSM_OK;
  2592. MultiCursor *pCsr;
  2593. for(pCsr=pDb->pCsr; rc==LSM_OK && pCsr; pCsr=pCsr->pNext){
  2594. rc = mcursorSave(pCsr);
  2595. }
  2596. return rc;
  2597. }
  2598. int lsmRestoreCursors(lsm_db *pDb){
  2599. int rc = LSM_OK;
  2600. MultiCursor *pCsr;
  2601. for(pCsr=pDb->pCsr; rc==LSM_OK && pCsr; pCsr=pCsr->pNext){
  2602. rc = mcursorRestore(pDb, pCsr);
  2603. }
  2604. return rc;
  2605. }
  2606. int lsmMCursorFirst(MultiCursor *pCsr){
  2607. return multiCursorEnd(pCsr, 0);
  2608. }
  2609. int lsmMCursorLast(MultiCursor *pCsr){
  2610. return multiCursorEnd(pCsr, 1);
  2611. }
  2612. lsm_db *lsmMCursorDb(MultiCursor *pCsr){
  2613. return pCsr->pDb;
  2614. }
  2615. void lsmMCursorReset(MultiCursor *pCsr){
  2616. int i;
  2617. lsmTreeCursorReset(pCsr->apTreeCsr[0]);
  2618. lsmTreeCursorReset(pCsr->apTreeCsr[1]);
  2619. for(i=0; i<pCsr->nPtr; i++){
  2620. segmentPtrReset(&pCsr->aPtr[i], LSM_SEGMENTPTR_FREE_THRESHOLD);
  2621. }
  2622. pCsr->key.nData = 0;
  2623. }
  2624. static int treeCursorSeek(
  2625. MultiCursor *pCsr,
  2626. TreeCursor *pTreeCsr,
  2627. void *pKey, int nKey,
  2628. int eSeek,
  2629. int *pbStop
  2630. ){
  2631. int rc = LSM_OK;
  2632. if( pTreeCsr ){
  2633. int res = 0;
  2634. lsmTreeCursorSeek(pTreeCsr, pKey, nKey, &res);
  2635. switch( eSeek ){
  2636. case LSM_SEEK_EQ: {
  2637. int eType = lsmTreeCursorFlags(pTreeCsr);
  2638. if( (res<0 && (eType & LSM_START_DELETE))
  2639. || (res>0 && (eType & LSM_END_DELETE))
  2640. || (res==0 && (eType & LSM_POINT_DELETE))
  2641. ){
  2642. *pbStop = 1;
  2643. }else if( res==0 && (eType & LSM_INSERT) ){
  2644. lsm_env *pEnv = pCsr->pDb->pEnv;
  2645. void *p; int n; /* Key/value from tree-cursor */
  2646. *pbStop = 1;
  2647. pCsr->flags |= CURSOR_SEEK_EQ;
  2648. rc = lsmTreeCursorKey(pTreeCsr, &pCsr->eType, &p, &n);
  2649. if( rc==LSM_OK ) rc = sortedBlobSet(pEnv, &pCsr->key, p, n);
  2650. if( rc==LSM_OK ) rc = lsmTreeCursorValue(pTreeCsr, &p, &n);
  2651. if( rc==LSM_OK ) rc = sortedBlobSet(pEnv, &pCsr->val, p, n);
  2652. }
  2653. lsmTreeCursorReset(pTreeCsr);
  2654. break;
  2655. }
  2656. case LSM_SEEK_GE:
  2657. if( res<0 && lsmTreeCursorValid(pTreeCsr) ){
  2658. lsmTreeCursorNext(pTreeCsr);
  2659. }
  2660. break;
  2661. default:
  2662. if( res>0 ){
  2663. assert( lsmTreeCursorValid(pTreeCsr) );
  2664. lsmTreeCursorPrev(pTreeCsr);
  2665. }
  2666. break;
  2667. }
  2668. }
  2669. return rc;
  2670. }
  2671. /*
  2672. ** Seek the cursor.
  2673. */
  2674. int lsmMCursorSeek(
  2675. MultiCursor *pCsr,
  2676. int iTopic,
  2677. void *pKey, int nKey,
  2678. int eSeek
  2679. ){
  2680. int eESeek = eSeek; /* Effective eSeek parameter */
  2681. int bStop = 0; /* Set to true to halt search operation */
  2682. int rc = LSM_OK; /* Return code */
  2683. int iPtr = 0; /* Used to iterate through pCsr->aPtr[] */
  2684. LsmPgno iPgno = 0; /* FC pointer value */
  2685. assert( pCsr->apTreeCsr[0]==0 || iTopic==0 );
  2686. assert( pCsr->apTreeCsr[1]==0 || iTopic==0 );
  2687. if( eESeek==LSM_SEEK_LEFAST ) eESeek = LSM_SEEK_LE;
  2688. assert( eESeek==LSM_SEEK_EQ || eESeek==LSM_SEEK_LE || eESeek==LSM_SEEK_GE );
  2689. assert( (pCsr->flags & CURSOR_FLUSH_FREELIST)==0 );
  2690. assert( pCsr->nPtr==0 || pCsr->aPtr[0].pLevel );
  2691. pCsr->flags &= ~(CURSOR_NEXT_OK | CURSOR_PREV_OK | CURSOR_SEEK_EQ);
  2692. rc = treeCursorSeek(pCsr, pCsr->apTreeCsr[0], pKey, nKey, eESeek, &bStop);
  2693. if( rc==LSM_OK && bStop==0 ){
  2694. rc = treeCursorSeek(pCsr, pCsr->apTreeCsr[1], pKey, nKey, eESeek, &bStop);
  2695. }
  2696. /* Seek all segment pointers. */
  2697. for(iPtr=0; iPtr<pCsr->nPtr && rc==LSM_OK && bStop==0; iPtr++){
  2698. SegmentPtr *pPtr = &pCsr->aPtr[iPtr];
  2699. assert( pPtr->pSeg==&pPtr->pLevel->lhs );
  2700. rc = seekInLevel(pCsr, pPtr, eESeek, iTopic, pKey, nKey, &iPgno, &bStop);
  2701. iPtr += pPtr->pLevel->nRight;
  2702. }
  2703. if( eSeek!=LSM_SEEK_EQ ){
  2704. if( rc==LSM_OK ){
  2705. rc = multiCursorAllocTree(pCsr);
  2706. }
  2707. if( rc==LSM_OK ){
  2708. int i;
  2709. for(i=pCsr->nTree-1; i>0; i--){
  2710. multiCursorDoCompare(pCsr, i, eESeek==LSM_SEEK_LE);
  2711. }
  2712. if( eSeek==LSM_SEEK_GE ) pCsr->flags |= CURSOR_NEXT_OK;
  2713. if( eSeek==LSM_SEEK_LE ) pCsr->flags |= CURSOR_PREV_OK;
  2714. }
  2715. multiCursorCacheKey(pCsr, &rc);
  2716. if( rc==LSM_OK && eSeek!=LSM_SEEK_LEFAST && 0==mcursorLocationOk(pCsr, 0) ){
  2717. switch( eESeek ){
  2718. case LSM_SEEK_EQ:
  2719. lsmMCursorReset(pCsr);
  2720. break;
  2721. case LSM_SEEK_GE:
  2722. rc = lsmMCursorNext(pCsr);
  2723. break;
  2724. default:
  2725. rc = lsmMCursorPrev(pCsr);
  2726. break;
  2727. }
  2728. }
  2729. }
  2730. return rc;
  2731. }
  2732. int lsmMCursorValid(MultiCursor *pCsr){
  2733. int res = 0;
  2734. if( pCsr->flags & CURSOR_SEEK_EQ ){
  2735. res = 1;
  2736. }else if( pCsr->aTree ){
  2737. int iKey = pCsr->aTree[1];
  2738. if( iKey==CURSOR_DATA_TREE0 || iKey==CURSOR_DATA_TREE1 ){
  2739. res = lsmTreeCursorValid(pCsr->apTreeCsr[iKey-CURSOR_DATA_TREE0]);
  2740. }else{
  2741. void *pKey;
  2742. multiCursorGetKey(pCsr, iKey, 0, &pKey, 0);
  2743. res = pKey!=0;
  2744. }
  2745. }
  2746. return res;
  2747. }
  2748. static int mcursorAdvanceOk(
  2749. MultiCursor *pCsr,
  2750. int bReverse,
  2751. int *pRc
  2752. ){
  2753. void *pNew; /* Pointer to buffer containing new key */
  2754. int nNew; /* Size of buffer pNew in bytes */
  2755. int eNewType; /* Type of new record */
  2756. if( *pRc ) return 1;
  2757. /* Check the current key value. If it is not greater than (if bReverse==0)
  2758. ** or less than (if bReverse!=0) the key currently cached in pCsr->key,
  2759. ** then the cursor has not yet been successfully advanced.
  2760. */
  2761. multiCursorGetKey(pCsr, pCsr->aTree[1], &eNewType, &pNew, &nNew);
  2762. if( pNew ){
  2763. int typemask = (pCsr->flags & CURSOR_IGNORE_DELETE) ? ~(0) : LSM_SYSTEMKEY;
  2764. int res = sortedDbKeyCompare(pCsr,
  2765. eNewType & typemask, pNew, nNew,
  2766. pCsr->eType & typemask, pCsr->key.pData, pCsr->key.nData
  2767. );
  2768. if( (bReverse==0 && res<=0) || (bReverse!=0 && res>=0) ){
  2769. return 0;
  2770. }
  2771. multiCursorCacheKey(pCsr, pRc);
  2772. assert( pCsr->eType==eNewType );
  2773. /* If this cursor is configured to skip deleted keys, and the current
  2774. ** cursor points to a SORTED_DELETE entry, then the cursor has not been
  2775. ** successfully advanced.
  2776. **
  2777. ** Similarly, if the cursor is configured to skip system keys and the
  2778. ** current cursor points to a system key, it has not yet been advanced.
  2779. */
  2780. if( *pRc==LSM_OK && 0==mcursorLocationOk(pCsr, 0) ) return 0;
  2781. }
  2782. return 1;
  2783. }
  2784. static void flCsrAdvance(MultiCursor *pCsr){
  2785. assert( pCsr->flags & CURSOR_FLUSH_FREELIST );
  2786. if( pCsr->iFree % 2 ){
  2787. pCsr->iFree++;
  2788. }else{
  2789. int nEntry = pCsr->pDb->pWorker->freelist.nEntry;
  2790. FreelistEntry *aEntry = pCsr->pDb->pWorker->freelist.aEntry;
  2791. int i = nEntry - 1 - (pCsr->iFree / 2);
  2792. /* If the current entry is a delete and the "end-delete" key will not
  2793. ** be attached to the next entry, increment iFree by 1 only. */
  2794. if( aEntry[i].iId<0 ){
  2795. while( 1 ){
  2796. if( i==0 || aEntry[i-1].iBlk!=aEntry[i].iBlk-1 ){
  2797. pCsr->iFree--;
  2798. break;
  2799. }
  2800. if( aEntry[i-1].iId>=0 ) break;
  2801. pCsr->iFree += 2;
  2802. i--;
  2803. }
  2804. }
  2805. pCsr->iFree += 2;
  2806. }
  2807. }
  2808. static int multiCursorAdvance(MultiCursor *pCsr, int bReverse){
  2809. int rc = LSM_OK; /* Return Code */
  2810. if( lsmMCursorValid(pCsr) ){
  2811. do {
  2812. int iKey = pCsr->aTree[1];
  2813. assertCursorTree(pCsr);
  2814. /* If this multi-cursor is advancing forwards, and the sub-cursor
  2815. ** being advanced is the one that separator keys may be being read
  2816. ** from, record the current absolute pointer value. */
  2817. if( pCsr->pPrevMergePtr ){
  2818. if( iKey==(CURSOR_DATA_SEGMENT+pCsr->nPtr) ){
  2819. assert( pCsr->pBtCsr );
  2820. *pCsr->pPrevMergePtr = pCsr->pBtCsr->iPtr;
  2821. }else if( pCsr->pBtCsr==0 && pCsr->nPtr>0
  2822. && iKey==(CURSOR_DATA_SEGMENT+pCsr->nPtr-1)
  2823. ){
  2824. SegmentPtr *pPtr = &pCsr->aPtr[iKey-CURSOR_DATA_SEGMENT];
  2825. *pCsr->pPrevMergePtr = pPtr->iPtr+pPtr->iPgPtr;
  2826. }
  2827. }
  2828. if( iKey==CURSOR_DATA_TREE0 || iKey==CURSOR_DATA_TREE1 ){
  2829. TreeCursor *pTreeCsr = pCsr->apTreeCsr[iKey-CURSOR_DATA_TREE0];
  2830. if( bReverse ){
  2831. rc = lsmTreeCursorPrev(pTreeCsr);
  2832. }else{
  2833. rc = lsmTreeCursorNext(pTreeCsr);
  2834. }
  2835. }else if( iKey==CURSOR_DATA_SYSTEM ){
  2836. assert( pCsr->flags & CURSOR_FLUSH_FREELIST );
  2837. assert( bReverse==0 );
  2838. flCsrAdvance(pCsr);
  2839. }else if( iKey==(CURSOR_DATA_SEGMENT+pCsr->nPtr) ){
  2840. assert( bReverse==0 && pCsr->pBtCsr );
  2841. rc = btreeCursorNext(pCsr->pBtCsr);
  2842. }else{
  2843. rc = segmentCursorAdvance(pCsr, iKey-CURSOR_DATA_SEGMENT, bReverse);
  2844. }
  2845. if( rc==LSM_OK ){
  2846. int i;
  2847. for(i=(iKey+pCsr->nTree)/2; i>0; i=i/2){
  2848. multiCursorDoCompare(pCsr, i, bReverse);
  2849. }
  2850. assertCursorTree(pCsr);
  2851. }
  2852. }while( mcursorAdvanceOk(pCsr, bReverse, &rc)==0 );
  2853. }
  2854. return rc;
  2855. }
  2856. int lsmMCursorNext(MultiCursor *pCsr){
  2857. if( (pCsr->flags & CURSOR_NEXT_OK)==0 ) return LSM_MISUSE_BKPT;
  2858. return multiCursorAdvance(pCsr, 0);
  2859. }
  2860. int lsmMCursorPrev(MultiCursor *pCsr){
  2861. if( (pCsr->flags & CURSOR_PREV_OK)==0 ) return LSM_MISUSE_BKPT;
  2862. return multiCursorAdvance(pCsr, 1);
  2863. }
  2864. int lsmMCursorKey(MultiCursor *pCsr, void **ppKey, int *pnKey){
  2865. if( (pCsr->flags & CURSOR_SEEK_EQ) || pCsr->aTree==0 ){
  2866. *pnKey = pCsr->key.nData;
  2867. *ppKey = pCsr->key.pData;
  2868. }else{
  2869. int iKey = pCsr->aTree[1];
  2870. if( iKey==CURSOR_DATA_TREE0 || iKey==CURSOR_DATA_TREE1 ){
  2871. TreeCursor *pTreeCsr = pCsr->apTreeCsr[iKey-CURSOR_DATA_TREE0];
  2872. lsmTreeCursorKey(pTreeCsr, 0, ppKey, pnKey);
  2873. }else{
  2874. int nKey;
  2875. #ifndef NDEBUG
  2876. void *pKey;
  2877. int eType;
  2878. multiCursorGetKey(pCsr, iKey, &eType, &pKey, &nKey);
  2879. assert( eType==pCsr->eType );
  2880. assert( nKey==pCsr->key.nData );
  2881. assert( memcmp(pKey, pCsr->key.pData, nKey)==0 );
  2882. #endif
  2883. nKey = pCsr->key.nData;
  2884. if( nKey==0 ){
  2885. *ppKey = 0;
  2886. }else{
  2887. *ppKey = pCsr->key.pData;
  2888. }
  2889. *pnKey = nKey;
  2890. }
  2891. }
  2892. return LSM_OK;
  2893. }
  2894. /*
  2895. ** Compare the current key that cursor csr points to with pKey/nKey. Set
  2896. ** *piRes to the result and return LSM_OK.
  2897. */
  2898. int lsm_csr_cmp(lsm_cursor *csr, const void *pKey, int nKey, int *piRes){
  2899. MultiCursor *pCsr = (MultiCursor *)csr;
  2900. void *pCsrkey; int nCsrkey;
  2901. int rc;
  2902. rc = lsmMCursorKey(pCsr, &pCsrkey, &nCsrkey);
  2903. if( rc==LSM_OK ){
  2904. int (*xCmp)(void *, int, void *, int) = pCsr->pDb->xCmp;
  2905. *piRes = sortedKeyCompare(xCmp, 0, pCsrkey, nCsrkey, 0, (void *)pKey, nKey);
  2906. }
  2907. return rc;
  2908. }
  2909. int lsmMCursorValue(MultiCursor *pCsr, void **ppVal, int *pnVal){
  2910. void *pVal;
  2911. int nVal;
  2912. int rc;
  2913. if( (pCsr->flags & CURSOR_SEEK_EQ) || pCsr->aTree==0 ){
  2914. rc = LSM_OK;
  2915. nVal = pCsr->val.nData;
  2916. pVal = pCsr->val.pData;
  2917. }else{
  2918. assert( pCsr->aTree );
  2919. assert( mcursorLocationOk(pCsr, (pCsr->flags & CURSOR_IGNORE_DELETE)) );
  2920. rc = multiCursorGetVal(pCsr, pCsr->aTree[1], &pVal, &nVal);
  2921. if( pVal && rc==LSM_OK ){
  2922. rc = sortedBlobSet(pCsr->pDb->pEnv, &pCsr->val, pVal, nVal);
  2923. pVal = pCsr->val.pData;
  2924. }
  2925. if( rc!=LSM_OK ){
  2926. pVal = 0;
  2927. nVal = 0;
  2928. }
  2929. }
  2930. *ppVal = pVal;
  2931. *pnVal = nVal;
  2932. return rc;
  2933. }
  2934. int lsmMCursorType(MultiCursor *pCsr, int *peType){
  2935. assert( pCsr->aTree );
  2936. multiCursorGetKey(pCsr, pCsr->aTree[1], peType, 0, 0);
  2937. return LSM_OK;
  2938. }
  2939. /*
  2940. ** Buffer aData[], size nData, is assumed to contain a valid b-tree
  2941. ** hierarchy page image. Return the offset in aData[] of the next free
  2942. ** byte in the data area (where a new cell may be written if there is
  2943. ** space).
  2944. */
  2945. static int mergeWorkerPageOffset(u8 *aData, int nData){
  2946. int nRec;
  2947. int iOff;
  2948. int nKey;
  2949. int eType;
  2950. i64 nDummy;
  2951. nRec = lsmGetU16(&aData[SEGMENT_NRECORD_OFFSET(nData)]);
  2952. iOff = lsmGetU16(&aData[SEGMENT_CELLPTR_OFFSET(nData, nRec-1)]);
  2953. eType = aData[iOff++];
  2954. assert( eType==0
  2955. || eType==(LSM_SYSTEMKEY|LSM_SEPARATOR)
  2956. || eType==(LSM_SEPARATOR)
  2957. );
  2958. iOff += lsmVarintGet64(&aData[iOff], &nDummy);
  2959. iOff += lsmVarintGet32(&aData[iOff], &nKey);
  2960. return iOff + (eType ? nKey : 0);
  2961. }
  2962. /*
  2963. ** Following a checkpoint operation, database pages that are part of the
  2964. ** checkpointed state of the LSM are deemed read-only. This includes the
  2965. ** right-most page of the b-tree hierarchy of any separators array under
  2966. ** construction, and all pages between it and the b-tree root, inclusive.
  2967. ** This is a problem, as when further pages are appended to the separators
  2968. ** array, entries must be added to the indicated b-tree hierarchy pages.
  2969. **
  2970. ** This function copies all such b-tree pages to new locations, so that
  2971. ** they can be modified as required.
  2972. **
  2973. ** The complication is that not all database pages are the same size - due
  2974. ** to the way the file.c module works some (the first and last in each block)
  2975. ** are 4 bytes smaller than the others.
  2976. */
  2977. static int mergeWorkerMoveHierarchy(
  2978. MergeWorker *pMW, /* Merge worker */
  2979. int bSep /* True for separators run */
  2980. ){
  2981. lsm_db *pDb = pMW->pDb; /* Database handle */
  2982. int rc = LSM_OK; /* Return code */
  2983. int i;
  2984. Page **apHier = pMW->hier.apHier;
  2985. int nHier = pMW->hier.nHier;
  2986. for(i=0; rc==LSM_OK && i<nHier; i++){
  2987. Page *pNew = 0;
  2988. rc = lsmFsSortedAppend(pDb->pFS, pDb->pWorker, pMW->pLevel, 1, &pNew);
  2989. assert( rc==LSM_OK );
  2990. if( rc==LSM_OK ){
  2991. u8 *a1; int n1;
  2992. u8 *a2; int n2;
  2993. a1 = fsPageData(pNew, &n1);
  2994. a2 = fsPageData(apHier[i], &n2);
  2995. assert( n1==n2 || n1+4==n2 );
  2996. if( n1==n2 ){
  2997. memcpy(a1, a2, n2);
  2998. }else{
  2999. int nEntry = pageGetNRec(a2, n2);
  3000. int iEof1 = SEGMENT_EOF(n1, nEntry);
  3001. int iEof2 = SEGMENT_EOF(n2, nEntry);
  3002. memcpy(a1, a2, iEof2 - 4);
  3003. memcpy(&a1[iEof1], &a2[iEof2], n2 - iEof2);
  3004. }
  3005. lsmFsPageRelease(apHier[i]);
  3006. apHier[i] = pNew;
  3007. #if 0
  3008. assert( n1==n2 || n1+4==n2 || n2+4==n1 );
  3009. if( n1>=n2 ){
  3010. /* If n1 (size of the new page) is equal to or greater than n2 (the
  3011. ** size of the old page), then copy the data into the new page. If
  3012. ** n1==n2, this could be done with a single memcpy(). However,
  3013. ** since sometimes n1>n2, the page content and footer must be copied
  3014. ** separately. */
  3015. int nEntry = pageGetNRec(a2, n2);
  3016. int iEof1 = SEGMENT_EOF(n1, nEntry);
  3017. int iEof2 = SEGMENT_EOF(n2, nEntry);
  3018. memcpy(a1, a2, iEof2);
  3019. memcpy(&a1[iEof1], &a2[iEof2], n2 - iEof2);
  3020. lsmFsPageRelease(apHier[i]);
  3021. apHier[i] = pNew;
  3022. }else{
  3023. lsmPutU16(&a1[SEGMENT_FLAGS_OFFSET(n1)], SEGMENT_BTREE_FLAG);
  3024. lsmPutU16(&a1[SEGMENT_NRECORD_OFFSET(n1)], 0);
  3025. lsmPutU64(&a1[SEGMENT_POINTER_OFFSET(n1)], 0);
  3026. i = i - 1;
  3027. lsmFsPageRelease(pNew);
  3028. }
  3029. #endif
  3030. }
  3031. }
  3032. #ifdef LSM_DEBUG
  3033. if( rc==LSM_OK ){
  3034. for(i=0; i<nHier; i++) assert( lsmFsPageWritable(apHier[i]) );
  3035. }
  3036. #endif
  3037. return rc;
  3038. }
  3039. /*
  3040. ** Allocate and populate the MergeWorker.apHier[] array.
  3041. */
  3042. static int mergeWorkerLoadHierarchy(MergeWorker *pMW){
  3043. int rc = LSM_OK;
  3044. Segment *pSeg;
  3045. Hierarchy *p;
  3046. pSeg = &pMW->pLevel->lhs;
  3047. p = &pMW->hier;
  3048. if( p->apHier==0 && pSeg->iRoot!=0 ){
  3049. FileSystem *pFS = pMW->pDb->pFS;
  3050. lsm_env *pEnv = pMW->pDb->pEnv;
  3051. Page **apHier = 0;
  3052. int nHier = 0;
  3053. LsmPgno iPg = pSeg->iRoot;
  3054. do {
  3055. Page *pPg = 0;
  3056. u8 *aData;
  3057. int nData;
  3058. int flags;
  3059. rc = lsmFsDbPageGet(pFS, pSeg, iPg, &pPg);
  3060. if( rc!=LSM_OK ) break;
  3061. aData = fsPageData(pPg, &nData);
  3062. flags = pageGetFlags(aData, nData);
  3063. if( flags&SEGMENT_BTREE_FLAG ){
  3064. Page **apNew = (Page **)lsmRealloc(
  3065. pEnv, apHier, sizeof(Page *)*(nHier+1)
  3066. );
  3067. if( apNew==0 ){
  3068. rc = LSM_NOMEM_BKPT;
  3069. break;
  3070. }
  3071. apHier = apNew;
  3072. memmove(&apHier[1], &apHier[0], sizeof(Page *) * nHier);
  3073. nHier++;
  3074. apHier[0] = pPg;
  3075. iPg = pageGetPtr(aData, nData);
  3076. }else{
  3077. lsmFsPageRelease(pPg);
  3078. break;
  3079. }
  3080. }while( 1 );
  3081. if( rc==LSM_OK ){
  3082. u8 *aData;
  3083. int nData;
  3084. aData = fsPageData(apHier[0], &nData);
  3085. pMW->aSave[0].iPgno = pageGetPtr(aData, nData);
  3086. p->nHier = nHier;
  3087. p->apHier = apHier;
  3088. rc = mergeWorkerMoveHierarchy(pMW, 0);
  3089. }else{
  3090. int i;
  3091. for(i=0; i<nHier; i++){
  3092. lsmFsPageRelease(apHier[i]);
  3093. }
  3094. lsmFree(pEnv, apHier);
  3095. }
  3096. }
  3097. return rc;
  3098. }
  3099. /*
  3100. ** B-tree pages use almost the same format as regular pages. The
  3101. ** differences are:
  3102. **
  3103. ** 1. The record format is (usually, see below) as follows:
  3104. **
  3105. ** + Type byte (always SORTED_SEPARATOR or SORTED_SYSTEM_SEPARATOR),
  3106. ** + Absolute pointer value (varint),
  3107. ** + Number of bytes in key (varint),
  3108. ** + LsmBlob containing key data.
  3109. **
  3110. ** 2. All pointer values are stored as absolute values (not offsets
  3111. ** relative to the footer pointer value).
  3112. **
  3113. ** 3. Each pointer that is part of a record points to a page that
  3114. ** contains keys smaller than the records key (note: not "equal to or
  3115. ** smaller than - smaller than").
  3116. **
  3117. ** 4. The pointer in the page footer of a b-tree page points to a page
  3118. ** that contains keys equal to or larger than the largest key on the
  3119. ** b-tree page.
  3120. **
  3121. ** The reason for having the page footer pointer point to the right-child
  3122. ** (instead of the left) is that doing things this way makes the
  3123. ** mergeWorkerMoveHierarchy() operation less complicated (since the pointers
  3124. ** that need to be updated are all stored as fixed-size integers within the
  3125. ** page footer, not varints in page records).
  3126. **
  3127. ** Records may not span b-tree pages. If this function is called to add a
  3128. ** record larger than (page-size / 4) bytes, then a pointer to the indexed
  3129. ** array page that contains the main record is added to the b-tree instead.
  3130. ** In this case the record format is:
  3131. **
  3132. ** + 0x00 byte (1 byte)
  3133. ** + Absolute pointer value (varint),
  3134. ** + Absolute page number of page containing key (varint).
  3135. **
  3136. ** See function seekInBtree() for the code that traverses b-tree pages.
  3137. */
  3138. static int mergeWorkerBtreeWrite(
  3139. MergeWorker *pMW,
  3140. u8 eType,
  3141. LsmPgno iPtr,
  3142. LsmPgno iKeyPg,
  3143. void *pKey,
  3144. int nKey
  3145. ){
  3146. Hierarchy *p = &pMW->hier;
  3147. lsm_db *pDb = pMW->pDb; /* Database handle */
  3148. int rc = LSM_OK; /* Return Code */
  3149. int iLevel; /* Level of b-tree hierachy to write to */
  3150. int nData; /* Size of aData[] in bytes */
  3151. u8 *aData; /* Page data for level iLevel */
  3152. int iOff; /* Offset on b-tree page to write record to */
  3153. int nRec; /* Initial number of records on b-tree page */
  3154. /* iKeyPg should be zero for an ordinary b-tree key, or non-zero for an
  3155. ** indirect key. The flags byte for an indirect key is 0x00. */
  3156. assert( (eType==0)==(iKeyPg!=0) );
  3157. /* The MergeWorker.apHier[] array contains the right-most leaf of the b-tree
  3158. ** hierarchy, the root node, and all nodes that lie on the path between.
  3159. ** apHier[0] is the right-most leaf and apHier[pMW->nHier-1] is the current
  3160. ** root page.
  3161. **
  3162. ** This loop searches for a node with enough space to store the key on,
  3163. ** starting with the leaf and iterating up towards the root. When the loop
  3164. ** exits, the key may be written to apHier[iLevel]. */
  3165. for(iLevel=0; iLevel<=p->nHier; iLevel++){
  3166. int nByte; /* Number of free bytes required */
  3167. if( iLevel==p->nHier ){
  3168. /* Extend the array and allocate a new root page. */
  3169. Page **aNew;
  3170. aNew = (Page **)lsmRealloc(
  3171. pMW->pDb->pEnv, p->apHier, sizeof(Page *)*(p->nHier+1)
  3172. );
  3173. if( !aNew ){
  3174. return LSM_NOMEM_BKPT;
  3175. }
  3176. p->apHier = aNew;
  3177. }else{
  3178. Page *pOld;
  3179. int nFree;
  3180. /* If the key will fit on this page, break out of the loop here.
  3181. ** The new entry will be written to page apHier[iLevel]. */
  3182. pOld = p->apHier[iLevel];
  3183. assert( lsmFsPageWritable(pOld) );
  3184. aData = fsPageData(pOld, &nData);
  3185. if( eType==0 ){
  3186. nByte = 2 + 1 + lsmVarintLen64(iPtr) + lsmVarintLen64(iKeyPg);
  3187. }else{
  3188. nByte = 2 + 1 + lsmVarintLen64(iPtr) + lsmVarintLen32(nKey) + nKey;
  3189. }
  3190. nRec = pageGetNRec(aData, nData);
  3191. nFree = SEGMENT_EOF(nData, nRec) - mergeWorkerPageOffset(aData, nData);
  3192. if( nByte<=nFree ) break;
  3193. /* Otherwise, this page is full. Set the right-hand-child pointer
  3194. ** to iPtr and release it. */
  3195. lsmPutU64(&aData[SEGMENT_POINTER_OFFSET(nData)], iPtr);
  3196. assert( lsmFsPageNumber(pOld)==0 );
  3197. rc = lsmFsPagePersist(pOld);
  3198. if( rc==LSM_OK ){
  3199. iPtr = lsmFsPageNumber(pOld);
  3200. lsmFsPageRelease(pOld);
  3201. }
  3202. }
  3203. /* Allocate a new page for apHier[iLevel]. */
  3204. p->apHier[iLevel] = 0;
  3205. if( rc==LSM_OK ){
  3206. rc = lsmFsSortedAppend(
  3207. pDb->pFS, pDb->pWorker, pMW->pLevel, 1, &p->apHier[iLevel]
  3208. );
  3209. }
  3210. if( rc!=LSM_OK ) return rc;
  3211. aData = fsPageData(p->apHier[iLevel], &nData);
  3212. memset(aData, 0, nData);
  3213. lsmPutU16(&aData[SEGMENT_FLAGS_OFFSET(nData)], SEGMENT_BTREE_FLAG);
  3214. lsmPutU16(&aData[SEGMENT_NRECORD_OFFSET(nData)], 0);
  3215. if( iLevel==p->nHier ){
  3216. p->nHier++;
  3217. break;
  3218. }
  3219. }
  3220. /* Write the key into page apHier[iLevel]. */
  3221. aData = fsPageData(p->apHier[iLevel], &nData);
  3222. iOff = mergeWorkerPageOffset(aData, nData);
  3223. nRec = pageGetNRec(aData, nData);
  3224. lsmPutU16(&aData[SEGMENT_CELLPTR_OFFSET(nData, nRec)], (u16)iOff);
  3225. lsmPutU16(&aData[SEGMENT_NRECORD_OFFSET(nData)], (u16)(nRec+1));
  3226. if( eType==0 ){
  3227. aData[iOff++] = 0x00;
  3228. iOff += lsmVarintPut64(&aData[iOff], iPtr);
  3229. iOff += lsmVarintPut64(&aData[iOff], iKeyPg);
  3230. }else{
  3231. aData[iOff++] = eType;
  3232. iOff += lsmVarintPut64(&aData[iOff], iPtr);
  3233. iOff += lsmVarintPut32(&aData[iOff], nKey);
  3234. memcpy(&aData[iOff], pKey, nKey);
  3235. }
  3236. return rc;
  3237. }
  3238. static int mergeWorkerBtreeIndirect(MergeWorker *pMW){
  3239. int rc = LSM_OK;
  3240. if( pMW->iIndirect ){
  3241. LsmPgno iKeyPg = pMW->aSave[1].iPgno;
  3242. rc = mergeWorkerBtreeWrite(pMW, 0, pMW->iIndirect, iKeyPg, 0, 0);
  3243. pMW->iIndirect = 0;
  3244. }
  3245. return rc;
  3246. }
  3247. /*
  3248. ** Append the database key (iTopic/pKey/nKey) to the b-tree under
  3249. ** construction. This key has not yet been written to a segment page.
  3250. ** The pointer that will accompany the new key in the b-tree - that
  3251. ** points to the completed segment page that contains keys smaller than
  3252. ** (pKey/nKey) is currently stored in pMW->aSave[0].iPgno.
  3253. */
  3254. static int mergeWorkerPushHierarchy(
  3255. MergeWorker *pMW, /* Merge worker object */
  3256. int iTopic, /* Topic value for this key */
  3257. void *pKey, /* Pointer to key buffer */
  3258. int nKey /* Size of pKey buffer in bytes */
  3259. ){
  3260. int rc = LSM_OK; /* Return Code */
  3261. LsmPgno iPtr; /* Pointer value to accompany pKey/nKey */
  3262. assert( pMW->aSave[0].bStore==0 );
  3263. assert( pMW->aSave[1].bStore==0 );
  3264. rc = mergeWorkerBtreeIndirect(pMW);
  3265. /* Obtain the absolute pointer value to store along with the key in the
  3266. ** page body. This pointer points to a page that contains keys that are
  3267. ** smaller than pKey/nKey. */
  3268. iPtr = pMW->aSave[0].iPgno;
  3269. assert( iPtr!=0 );
  3270. /* Determine if the indirect format should be used. */
  3271. if( (nKey*4 > lsmFsPageSize(pMW->pDb->pFS)) ){
  3272. pMW->iIndirect = iPtr;
  3273. pMW->aSave[1].bStore = 1;
  3274. }else{
  3275. rc = mergeWorkerBtreeWrite(
  3276. pMW, (u8)(iTopic | LSM_SEPARATOR), iPtr, 0, pKey, nKey
  3277. );
  3278. }
  3279. /* Ensure that the SortedRun.iRoot field is correct. */
  3280. return rc;
  3281. }
  3282. static int mergeWorkerFinishHierarchy(
  3283. MergeWorker *pMW /* Merge worker object */
  3284. ){
  3285. int i; /* Used to loop through apHier[] */
  3286. int rc = LSM_OK; /* Return code */
  3287. LsmPgno iPtr; /* New right-hand-child pointer value */
  3288. iPtr = pMW->aSave[0].iPgno;
  3289. for(i=0; i<pMW->hier.nHier && rc==LSM_OK; i++){
  3290. Page *pPg = pMW->hier.apHier[i];
  3291. int nData; /* Size of aData[] in bytes */
  3292. u8 *aData; /* Page data for pPg */
  3293. aData = fsPageData(pPg, &nData);
  3294. lsmPutU64(&aData[SEGMENT_POINTER_OFFSET(nData)], iPtr);
  3295. rc = lsmFsPagePersist(pPg);
  3296. iPtr = lsmFsPageNumber(pPg);
  3297. lsmFsPageRelease(pPg);
  3298. }
  3299. if( pMW->hier.nHier ){
  3300. pMW->pLevel->lhs.iRoot = iPtr;
  3301. lsmFree(pMW->pDb->pEnv, pMW->hier.apHier);
  3302. pMW->hier.apHier = 0;
  3303. pMW->hier.nHier = 0;
  3304. }
  3305. return rc;
  3306. }
  3307. static int mergeWorkerAddPadding(
  3308. MergeWorker *pMW /* Merge worker object */
  3309. ){
  3310. FileSystem *pFS = pMW->pDb->pFS;
  3311. return lsmFsSortedPadding(pFS, pMW->pDb->pWorker, &pMW->pLevel->lhs);
  3312. }
  3313. /*
  3314. ** Release all page references currently held by the merge-worker passed
  3315. ** as the only argument. Unless an error has occurred, all pages have
  3316. ** already been released.
  3317. */
  3318. static void mergeWorkerReleaseAll(MergeWorker *pMW){
  3319. int i;
  3320. lsmFsPageRelease(pMW->pPage);
  3321. pMW->pPage = 0;
  3322. for(i=0; i<pMW->hier.nHier; i++){
  3323. lsmFsPageRelease(pMW->hier.apHier[i]);
  3324. pMW->hier.apHier[i] = 0;
  3325. }
  3326. lsmFree(pMW->pDb->pEnv, pMW->hier.apHier);
  3327. pMW->hier.apHier = 0;
  3328. pMW->hier.nHier = 0;
  3329. }
  3330. static int keyszToSkip(FileSystem *pFS, int nKey){
  3331. int nPgsz; /* Nominal database page size */
  3332. nPgsz = lsmFsPageSize(pFS);
  3333. return LSM_MIN(((nKey * 4) / nPgsz), 3);
  3334. }
  3335. /*
  3336. ** Release the reference to the current output page of merge-worker *pMW
  3337. ** (reference pMW->pPage). Set the page number values in aSave[] as
  3338. ** required (see comments above struct MergeWorker for details).
  3339. */
  3340. static int mergeWorkerPersistAndRelease(MergeWorker *pMW){
  3341. int rc;
  3342. int i;
  3343. assert( pMW->pPage || (pMW->aSave[0].bStore==0 && pMW->aSave[1].bStore==0) );
  3344. /* Persist the page */
  3345. rc = lsmFsPagePersist(pMW->pPage);
  3346. /* If required, save the page number. */
  3347. for(i=0; i<2; i++){
  3348. if( pMW->aSave[i].bStore ){
  3349. pMW->aSave[i].iPgno = lsmFsPageNumber(pMW->pPage);
  3350. pMW->aSave[i].bStore = 0;
  3351. }
  3352. }
  3353. /* Release the completed output page. */
  3354. lsmFsPageRelease(pMW->pPage);
  3355. pMW->pPage = 0;
  3356. return rc;
  3357. }
  3358. /*
  3359. ** Advance to the next page of an output run being populated by merge-worker
  3360. ** pMW. The footer of the new page is initialized to indicate that it contains
  3361. ** zero records. The flags field is cleared. The page footer pointer field
  3362. ** is set to iFPtr.
  3363. **
  3364. ** If successful, LSM_OK is returned. Otherwise, an error code.
  3365. */
  3366. static int mergeWorkerNextPage(
  3367. MergeWorker *pMW, /* Merge worker object to append page to */
  3368. LsmPgno iFPtr /* Pointer value for footer of new page */
  3369. ){
  3370. int rc = LSM_OK; /* Return code */
  3371. Page *pNext = 0; /* New page appended to run */
  3372. lsm_db *pDb = pMW->pDb; /* Database handle */
  3373. rc = lsmFsSortedAppend(pDb->pFS, pDb->pWorker, pMW->pLevel, 0, &pNext);
  3374. assert( rc || pMW->pLevel->lhs.iFirst>0 || pMW->pDb->compress.xCompress );
  3375. if( rc==LSM_OK ){
  3376. u8 *aData; /* Data buffer belonging to page pNext */
  3377. int nData; /* Size of aData[] in bytes */
  3378. rc = mergeWorkerPersistAndRelease(pMW);
  3379. pMW->pPage = pNext;
  3380. pMW->pLevel->pMerge->iOutputOff = 0;
  3381. aData = fsPageData(pNext, &nData);
  3382. lsmPutU16(&aData[SEGMENT_NRECORD_OFFSET(nData)], 0);
  3383. lsmPutU16(&aData[SEGMENT_FLAGS_OFFSET(nData)], 0);
  3384. lsmPutU64(&aData[SEGMENT_POINTER_OFFSET(nData)], iFPtr);
  3385. pMW->nWork++;
  3386. }
  3387. return rc;
  3388. }
  3389. /*
  3390. ** Write a blob of data into an output segment being populated by a
  3391. ** merge-worker object. If argument bSep is true, write into the separators
  3392. ** array. Otherwise, the main array.
  3393. **
  3394. ** This function is used to write the blobs of data for keys and values.
  3395. */
  3396. static int mergeWorkerData(
  3397. MergeWorker *pMW, /* Merge worker object */
  3398. int bSep, /* True to write to separators run */
  3399. LsmPgno iFPtr, /* Footer ptr for new pages */
  3400. u8 *aWrite, /* Write data from this buffer */
  3401. int nWrite /* Size of aWrite[] in bytes */
  3402. ){
  3403. int rc = LSM_OK; /* Return code */
  3404. int nRem = nWrite; /* Number of bytes still to write */
  3405. while( rc==LSM_OK && nRem>0 ){
  3406. Merge *pMerge = pMW->pLevel->pMerge;
  3407. int nCopy; /* Number of bytes to copy */
  3408. u8 *aData; /* Pointer to buffer of current output page */
  3409. int nData; /* Size of aData[] in bytes */
  3410. int nRec; /* Number of records on current output page */
  3411. int iOff; /* Offset in aData[] to write to */
  3412. assert( lsmFsPageWritable(pMW->pPage) );
  3413. aData = fsPageData(pMW->pPage, &nData);
  3414. nRec = pageGetNRec(aData, nData);
  3415. iOff = pMerge->iOutputOff;
  3416. nCopy = LSM_MIN(nRem, SEGMENT_EOF(nData, nRec) - iOff);
  3417. memcpy(&aData[iOff], &aWrite[nWrite-nRem], nCopy);
  3418. nRem -= nCopy;
  3419. if( nRem>0 ){
  3420. rc = mergeWorkerNextPage(pMW, iFPtr);
  3421. }else{
  3422. pMerge->iOutputOff = iOff + nCopy;
  3423. }
  3424. }
  3425. return rc;
  3426. }
  3427. /*
  3428. ** The MergeWorker passed as the only argument is working to merge two or
  3429. ** more existing segments together (not to flush an in-memory tree). It
  3430. ** has not yet written the first key to the first page of the output.
  3431. */
  3432. static int mergeWorkerFirstPage(MergeWorker *pMW){
  3433. int rc = LSM_OK; /* Return code */
  3434. Page *pPg = 0; /* First page of run pSeg */
  3435. LsmPgno iFPtr = 0; /* Pointer value read from footer of pPg */
  3436. MultiCursor *pCsr = pMW->pCsr;
  3437. assert( pMW->pPage==0 );
  3438. if( pCsr->pBtCsr ){
  3439. rc = LSM_OK;
  3440. iFPtr = pMW->pLevel->pNext->lhs.iFirst;
  3441. }else if( pCsr->nPtr>0 ){
  3442. Segment *pSeg;
  3443. pSeg = pCsr->aPtr[pCsr->nPtr-1].pSeg;
  3444. rc = lsmFsDbPageGet(pMW->pDb->pFS, pSeg, pSeg->iFirst, &pPg);
  3445. if( rc==LSM_OK ){
  3446. u8 *aData; /* Buffer for page pPg */
  3447. int nData; /* Size of aData[] in bytes */
  3448. aData = fsPageData(pPg, &nData);
  3449. iFPtr = pageGetPtr(aData, nData);
  3450. lsmFsPageRelease(pPg);
  3451. }
  3452. }
  3453. if( rc==LSM_OK ){
  3454. rc = mergeWorkerNextPage(pMW, iFPtr);
  3455. if( pCsr->pPrevMergePtr ) *pCsr->pPrevMergePtr = iFPtr;
  3456. pMW->aSave[0].bStore = 1;
  3457. }
  3458. return rc;
  3459. }
  3460. static int mergeWorkerWrite(
  3461. MergeWorker *pMW, /* Merge worker object to write into */
  3462. int eType, /* One of SORTED_SEPARATOR, WRITE or DELETE */
  3463. void *pKey, int nKey, /* Key value */
  3464. void *pVal, int nVal, /* Value value */
  3465. LsmPgno iPtr /* Absolute value of page pointer, or 0 */
  3466. ){
  3467. int rc = LSM_OK; /* Return code */
  3468. Merge *pMerge; /* Persistent part of level merge state */
  3469. int nHdr; /* Space required for this record header */
  3470. Page *pPg; /* Page to write to */
  3471. u8 *aData; /* Data buffer for page pWriter->pPage */
  3472. int nData = 0; /* Size of buffer aData[] in bytes */
  3473. int nRec = 0; /* Number of records on page pPg */
  3474. LsmPgno iFPtr = 0; /* Value of pointer in footer of pPg */
  3475. LsmPgno iRPtr = 0; /* Value of pointer written into record */
  3476. int iOff = 0; /* Current write offset within page pPg */
  3477. Segment *pSeg; /* Segment being written */
  3478. int flags = 0; /* If != 0, flags value for page footer */
  3479. int bFirst = 0; /* True for first key of output run */
  3480. pMerge = pMW->pLevel->pMerge;
  3481. pSeg = &pMW->pLevel->lhs;
  3482. if( pSeg->iFirst==0 && pMW->pPage==0 ){
  3483. rc = mergeWorkerFirstPage(pMW);
  3484. bFirst = 1;
  3485. }
  3486. pPg = pMW->pPage;
  3487. if( pPg ){
  3488. aData = fsPageData(pPg, &nData);
  3489. nRec = pageGetNRec(aData, nData);
  3490. iFPtr = pageGetPtr(aData, nData);
  3491. iRPtr = iPtr ? (iPtr - iFPtr) : 0;
  3492. }
  3493. /* Figure out how much space is required by the new record. The space
  3494. ** required is divided into two sections: the header and the body. The
  3495. ** header consists of the intial varint fields. The body are the blobs
  3496. ** of data that correspond to the key and value data. The entire header
  3497. ** must be stored on the page. The body may overflow onto the next and
  3498. ** subsequent pages.
  3499. **
  3500. ** The header space is:
  3501. **
  3502. ** 1) record type - 1 byte.
  3503. ** 2) Page-pointer-offset - 1 varint
  3504. ** 3) Key size - 1 varint
  3505. ** 4) Value size - 1 varint (only if LSM_INSERT flag is set)
  3506. */
  3507. if( rc==LSM_OK ){
  3508. nHdr = 1 + lsmVarintLen64(iRPtr) + lsmVarintLen32(nKey);
  3509. if( rtIsWrite(eType) ) nHdr += lsmVarintLen32(nVal);
  3510. /* If the entire header will not fit on page pPg, or if page pPg is
  3511. ** marked read-only, advance to the next page of the output run. */
  3512. iOff = pMerge->iOutputOff;
  3513. if( iOff<0 || pPg==0 || iOff+nHdr > SEGMENT_EOF(nData, nRec+1) ){
  3514. if( iOff>=0 && pPg ){
  3515. /* Zero any free space on the page */
  3516. assert( aData );
  3517. memset(&aData[iOff], 0, SEGMENT_EOF(nData, nRec)-iOff);
  3518. }
  3519. iFPtr = *pMW->pCsr->pPrevMergePtr;
  3520. iRPtr = iPtr ? (iPtr - iFPtr) : 0;
  3521. iOff = 0;
  3522. nRec = 0;
  3523. rc = mergeWorkerNextPage(pMW, iFPtr);
  3524. pPg = pMW->pPage;
  3525. }
  3526. }
  3527. /* If this record header will be the first on the page, and the page is
  3528. ** not the very first in the entire run, add a copy of the key to the
  3529. ** b-tree hierarchy.
  3530. */
  3531. if( rc==LSM_OK && nRec==0 && bFirst==0 ){
  3532. assert( pMerge->nSkip>=0 );
  3533. if( pMerge->nSkip==0 ){
  3534. rc = mergeWorkerPushHierarchy(pMW, rtTopic(eType), pKey, nKey);
  3535. assert( pMW->aSave[0].bStore==0 );
  3536. pMW->aSave[0].bStore = 1;
  3537. pMerge->nSkip = keyszToSkip(pMW->pDb->pFS, nKey);
  3538. }else{
  3539. pMerge->nSkip--;
  3540. flags = PGFTR_SKIP_THIS_FLAG;
  3541. }
  3542. if( pMerge->nSkip ) flags |= PGFTR_SKIP_NEXT_FLAG;
  3543. }
  3544. /* Update the output segment */
  3545. if( rc==LSM_OK ){
  3546. aData = fsPageData(pPg, &nData);
  3547. /* Update the page footer. */
  3548. lsmPutU16(&aData[SEGMENT_NRECORD_OFFSET(nData)], (u16)(nRec+1));
  3549. lsmPutU16(&aData[SEGMENT_CELLPTR_OFFSET(nData, nRec)], (u16)iOff);
  3550. if( flags ) lsmPutU16(&aData[SEGMENT_FLAGS_OFFSET(nData)], (u16)flags);
  3551. /* Write the entry header into the current page. */
  3552. aData[iOff++] = (u8)eType; /* 1 */
  3553. iOff += lsmVarintPut64(&aData[iOff], iRPtr); /* 2 */
  3554. iOff += lsmVarintPut32(&aData[iOff], nKey); /* 3 */
  3555. if( rtIsWrite(eType) ) iOff += lsmVarintPut32(&aData[iOff], nVal); /* 4 */
  3556. pMerge->iOutputOff = iOff;
  3557. /* Write the key and data into the segment. */
  3558. assert( iFPtr==pageGetPtr(aData, nData) );
  3559. rc = mergeWorkerData(pMW, 0, iFPtr+iRPtr, pKey, nKey);
  3560. if( rc==LSM_OK && rtIsWrite(eType) ){
  3561. if( rc==LSM_OK ){
  3562. rc = mergeWorkerData(pMW, 0, iFPtr+iRPtr, pVal, nVal);
  3563. }
  3564. }
  3565. }
  3566. return rc;
  3567. }
  3568. /*
  3569. ** Free all resources allocated by mergeWorkerInit().
  3570. */
  3571. static void mergeWorkerShutdown(MergeWorker *pMW, int *pRc){
  3572. int i; /* Iterator variable */
  3573. int rc = *pRc;
  3574. MultiCursor *pCsr = pMW->pCsr;
  3575. /* Unless the merge has finished, save the cursor position in the
  3576. ** Merge.aInput[] array. See function mergeWorkerInit() for the
  3577. ** code to restore a cursor position based on aInput[]. */
  3578. if( rc==LSM_OK && pCsr ){
  3579. Merge *pMerge = pMW->pLevel->pMerge;
  3580. if( lsmMCursorValid(pCsr) ){
  3581. int bBtree = (pCsr->pBtCsr!=0);
  3582. int iPtr;
  3583. /* pMerge->nInput==0 indicates that this is a FlushTree() operation. */
  3584. assert( pMerge->nInput==0 || pMW->pLevel->nRight>0 );
  3585. assert( pMerge->nInput==0 || pMerge->nInput==(pCsr->nPtr+bBtree) );
  3586. for(i=0; i<(pMerge->nInput-bBtree); i++){
  3587. SegmentPtr *pPtr = &pCsr->aPtr[i];
  3588. if( pPtr->pPg ){
  3589. pMerge->aInput[i].iPg = lsmFsPageNumber(pPtr->pPg);
  3590. pMerge->aInput[i].iCell = pPtr->iCell;
  3591. }else{
  3592. pMerge->aInput[i].iPg = 0;
  3593. pMerge->aInput[i].iCell = 0;
  3594. }
  3595. }
  3596. if( bBtree && pMerge->nInput ){
  3597. assert( i==pCsr->nPtr );
  3598. btreeCursorPosition(pCsr->pBtCsr, &pMerge->aInput[i]);
  3599. }
  3600. /* Store the location of the split-key */
  3601. iPtr = pCsr->aTree[1] - CURSOR_DATA_SEGMENT;
  3602. if( iPtr<pCsr->nPtr ){
  3603. pMerge->splitkey = pMerge->aInput[iPtr];
  3604. }else{
  3605. btreeCursorSplitkey(pCsr->pBtCsr, &pMerge->splitkey);
  3606. }
  3607. }
  3608. /* Zero any free space left on the final page. This helps with
  3609. ** compression if using a compression hook. And prevents valgrind
  3610. ** from complaining about uninitialized byte passed to write(). */
  3611. if( pMW->pPage ){
  3612. int nData;
  3613. u8 *aData = fsPageData(pMW->pPage, &nData);
  3614. int iOff = pMerge->iOutputOff;
  3615. int iEof = SEGMENT_EOF(nData, pageGetNRec(aData, nData));
  3616. memset(&aData[iOff], 0, iEof - iOff);
  3617. }
  3618. pMerge->iOutputOff = -1;
  3619. }
  3620. lsmMCursorClose(pCsr, 0);
  3621. /* Persist and release the output page. */
  3622. if( rc==LSM_OK ) rc = mergeWorkerPersistAndRelease(pMW);
  3623. if( rc==LSM_OK ) rc = mergeWorkerBtreeIndirect(pMW);
  3624. if( rc==LSM_OK ) rc = mergeWorkerFinishHierarchy(pMW);
  3625. if( rc==LSM_OK ) rc = mergeWorkerAddPadding(pMW);
  3626. lsmFsFlushWaiting(pMW->pDb->pFS, &rc);
  3627. mergeWorkerReleaseAll(pMW);
  3628. lsmFree(pMW->pDb->pEnv, pMW->aGobble);
  3629. pMW->aGobble = 0;
  3630. pMW->pCsr = 0;
  3631. *pRc = rc;
  3632. }
  3633. /*
  3634. ** The cursor passed as the first argument is being used as the input for
  3635. ** a merge operation. When this function is called, *piFlags contains the
  3636. ** database entry flags for the current entry. The entry about to be written
  3637. ** to the output.
  3638. **
  3639. ** Note that this function only has to work for cursors configured to
  3640. ** iterate forwards (not backwards).
  3641. */
  3642. static void mergeRangeDeletes(MultiCursor *pCsr, int *piVal, int *piFlags){
  3643. int f = *piFlags;
  3644. int iKey = pCsr->aTree[1];
  3645. int i;
  3646. assert( pCsr->flags & CURSOR_NEXT_OK );
  3647. if( pCsr->flags & CURSOR_IGNORE_DELETE ){
  3648. /* The ignore-delete flag is set when the output of the merge will form
  3649. ** the oldest level in the database. In this case there is no point in
  3650. ** retaining any range-delete flags. */
  3651. assert( (f & LSM_POINT_DELETE)==0 );
  3652. f &= ~(LSM_START_DELETE|LSM_END_DELETE);
  3653. }else{
  3654. for(i=0; i<(CURSOR_DATA_SEGMENT + pCsr->nPtr); i++){
  3655. if( i!=iKey ){
  3656. int eType;
  3657. void *pKey;
  3658. int nKey;
  3659. int res;
  3660. multiCursorGetKey(pCsr, i, &eType, &pKey, &nKey);
  3661. if( pKey ){
  3662. res = sortedKeyCompare(pCsr->pDb->xCmp,
  3663. rtTopic(pCsr->eType), pCsr->key.pData, pCsr->key.nData,
  3664. rtTopic(eType), pKey, nKey
  3665. );
  3666. assert( res<=0 );
  3667. if( res==0 ){
  3668. if( (f & (LSM_INSERT|LSM_POINT_DELETE))==0 ){
  3669. if( eType & LSM_INSERT ){
  3670. f |= LSM_INSERT;
  3671. *piVal = i;
  3672. }
  3673. else if( eType & LSM_POINT_DELETE ){
  3674. f |= LSM_POINT_DELETE;
  3675. }
  3676. }
  3677. f |= (eType & (LSM_END_DELETE|LSM_START_DELETE));
  3678. }
  3679. if( i>iKey && (eType & LSM_END_DELETE) && res<0 ){
  3680. if( f & (LSM_INSERT|LSM_POINT_DELETE) ){
  3681. f |= (LSM_END_DELETE|LSM_START_DELETE);
  3682. }else{
  3683. f = 0;
  3684. }
  3685. break;
  3686. }
  3687. }
  3688. }
  3689. }
  3690. assert( (f & LSM_INSERT)==0 || (f & LSM_POINT_DELETE)==0 );
  3691. if( (f & LSM_START_DELETE)
  3692. && (f & LSM_END_DELETE)
  3693. && (f & LSM_POINT_DELETE )
  3694. ){
  3695. f = 0;
  3696. }
  3697. }
  3698. *piFlags = f;
  3699. }
  3700. static int mergeWorkerStep(MergeWorker *pMW){
  3701. lsm_db *pDb = pMW->pDb; /* Database handle */
  3702. MultiCursor *pCsr; /* Cursor to read input data from */
  3703. int rc = LSM_OK; /* Return code */
  3704. int eType; /* SORTED_SEPARATOR, WRITE or DELETE */
  3705. void *pKey; int nKey; /* Key */
  3706. LsmPgno iPtr;
  3707. int iVal;
  3708. pCsr = pMW->pCsr;
  3709. /* Pull the next record out of the source cursor. */
  3710. lsmMCursorKey(pCsr, &pKey, &nKey);
  3711. eType = pCsr->eType;
  3712. /* Figure out if the output record may have a different pointer value
  3713. ** than the previous. This is the case if the current key is identical to
  3714. ** a key that appears in the lowest level run being merged. If so, set
  3715. ** iPtr to the absolute pointer value. If not, leave iPtr set to zero,
  3716. ** indicating that the output pointer value should be a copy of the pointer
  3717. ** value written with the previous key. */
  3718. iPtr = (pCsr->pPrevMergePtr ? *pCsr->pPrevMergePtr : 0);
  3719. if( pCsr->pBtCsr ){
  3720. BtreeCursor *pBtCsr = pCsr->pBtCsr;
  3721. if( pBtCsr->pKey ){
  3722. int res = rtTopic(pBtCsr->eType) - rtTopic(eType);
  3723. if( res==0 ) res = pDb->xCmp(pBtCsr->pKey, pBtCsr->nKey, pKey, nKey);
  3724. if( 0==res ) iPtr = pBtCsr->iPtr;
  3725. assert( res>=0 );
  3726. }
  3727. }else if( pCsr->nPtr ){
  3728. SegmentPtr *pPtr = &pCsr->aPtr[pCsr->nPtr-1];
  3729. if( pPtr->pPg
  3730. && 0==pDb->xCmp(pPtr->pKey, pPtr->nKey, pKey, nKey)
  3731. ){
  3732. iPtr = pPtr->iPtr+pPtr->iPgPtr;
  3733. }
  3734. }
  3735. iVal = pCsr->aTree[1];
  3736. mergeRangeDeletes(pCsr, &iVal, &eType);
  3737. if( eType!=0 ){
  3738. if( pMW->aGobble ){
  3739. int iGobble = pCsr->aTree[1] - CURSOR_DATA_SEGMENT;
  3740. if( iGobble<pCsr->nPtr && iGobble>=0 ){
  3741. SegmentPtr *pGobble = &pCsr->aPtr[iGobble];
  3742. if( (pGobble->flags & PGFTR_SKIP_THIS_FLAG)==0 ){
  3743. pMW->aGobble[iGobble] = lsmFsPageNumber(pGobble->pPg);
  3744. }
  3745. }
  3746. }
  3747. /* If this is a separator key and we know that the output pointer has not
  3748. ** changed, there is no point in writing an output record. Otherwise,
  3749. ** proceed. */
  3750. if( rc==LSM_OK && (rtIsSeparator(eType)==0 || iPtr!=0) ){
  3751. /* Write the record into the main run. */
  3752. void *pVal; int nVal;
  3753. rc = multiCursorGetVal(pCsr, iVal, &pVal, &nVal);
  3754. if( pVal && rc==LSM_OK ){
  3755. assert( nVal>=0 );
  3756. rc = sortedBlobSet(pDb->pEnv, &pCsr->val, pVal, nVal);
  3757. pVal = pCsr->val.pData;
  3758. }
  3759. if( rc==LSM_OK ){
  3760. rc = mergeWorkerWrite(pMW, eType, pKey, nKey, pVal, nVal, iPtr);
  3761. }
  3762. }
  3763. }
  3764. /* Advance the cursor to the next input record (assuming one exists). */
  3765. assert( lsmMCursorValid(pMW->pCsr) );
  3766. if( rc==LSM_OK ) rc = lsmMCursorNext(pMW->pCsr);
  3767. return rc;
  3768. }
  3769. static int mergeWorkerDone(MergeWorker *pMW){
  3770. return pMW->pCsr==0 || !lsmMCursorValid(pMW->pCsr);
  3771. }
  3772. static void sortedFreeLevel(lsm_env *pEnv, Level *p){
  3773. if( p ){
  3774. lsmFree(pEnv, p->pSplitKey);
  3775. lsmFree(pEnv, p->pMerge);
  3776. lsmFree(pEnv, p->aRhs);
  3777. lsmFree(pEnv, p);
  3778. }
  3779. }
  3780. static void sortedInvokeWorkHook(lsm_db *pDb){
  3781. if( pDb->xWork ){
  3782. pDb->xWork(pDb, pDb->pWorkCtx);
  3783. }
  3784. }
  3785. static int sortedNewToplevel(
  3786. lsm_db *pDb, /* Connection handle */
  3787. int eTree, /* One of the TREE_XXX constants */
  3788. int *pnWrite /* OUT: Number of database pages written */
  3789. ){
  3790. int rc = LSM_OK; /* Return Code */
  3791. MultiCursor *pCsr = 0;
  3792. Level *pNext = 0; /* The current top level */
  3793. Level *pNew; /* The new level itself */
  3794. Segment *pLinked = 0; /* Delete separators from this segment */
  3795. Level *pDel = 0; /* Delete this entire level */
  3796. int nWrite = 0; /* Number of database pages written */
  3797. Freelist freelist;
  3798. if( eTree!=TREE_NONE ){
  3799. rc = lsmShmCacheChunks(pDb, pDb->treehdr.nChunk);
  3800. }
  3801. assert( pDb->bUseFreelist==0 );
  3802. pDb->pFreelist = &freelist;
  3803. pDb->bUseFreelist = 1;
  3804. memset(&freelist, 0, sizeof(freelist));
  3805. /* Allocate the new level structure to write to. */
  3806. pNext = lsmDbSnapshotLevel(pDb->pWorker);
  3807. pNew = (Level *)lsmMallocZeroRc(pDb->pEnv, sizeof(Level), &rc);
  3808. if( pNew ){
  3809. pNew->pNext = pNext;
  3810. lsmDbSnapshotSetLevel(pDb->pWorker, pNew);
  3811. }
  3812. /* Create a cursor to gather the data required by the new segment. The new
  3813. ** segment contains everything in the tree and pointers to the next segment
  3814. ** in the database (if any). */
  3815. pCsr = multiCursorNew(pDb, &rc);
  3816. if( pCsr ){
  3817. pCsr->pDb = pDb;
  3818. rc = multiCursorVisitFreelist(pCsr);
  3819. if( rc==LSM_OK ){
  3820. rc = multiCursorAddTree(pCsr, pDb->pWorker, eTree);
  3821. }
  3822. if( rc==LSM_OK && pNext && pNext->pMerge==0 ){
  3823. if( (pNext->flags & LEVEL_FREELIST_ONLY) ){
  3824. pDel = pNext;
  3825. pCsr->aPtr = lsmMallocZeroRc(pDb->pEnv, sizeof(SegmentPtr), &rc);
  3826. multiCursorAddOne(pCsr, pNext, &rc);
  3827. }else if( eTree!=TREE_NONE && pNext->lhs.iRoot ){
  3828. pLinked = &pNext->lhs;
  3829. rc = btreeCursorNew(pDb, pLinked, &pCsr->pBtCsr);
  3830. }
  3831. }
  3832. /* If this will be the only segment in the database, discard any delete
  3833. ** markers present in the in-memory tree. */
  3834. if( pNext==0 ){
  3835. multiCursorIgnoreDelete(pCsr);
  3836. }
  3837. }
  3838. if( rc!=LSM_OK ){
  3839. lsmMCursorClose(pCsr, 0);
  3840. }else{
  3841. LsmPgno iLeftPtr = 0;
  3842. Merge merge; /* Merge object used to create new level */
  3843. MergeWorker mergeworker; /* MergeWorker object for the same purpose */
  3844. memset(&merge, 0, sizeof(Merge));
  3845. memset(&mergeworker, 0, sizeof(MergeWorker));
  3846. pNew->pMerge = &merge;
  3847. pNew->flags |= LEVEL_INCOMPLETE;
  3848. mergeworker.pDb = pDb;
  3849. mergeworker.pLevel = pNew;
  3850. mergeworker.pCsr = pCsr;
  3851. pCsr->pPrevMergePtr = &iLeftPtr;
  3852. /* Mark the separators array for the new level as a "phantom". */
  3853. mergeworker.bFlush = 1;
  3854. /* Do the work to create the new merged segment on disk */
  3855. if( rc==LSM_OK ) rc = lsmMCursorFirst(pCsr);
  3856. while( rc==LSM_OK && mergeWorkerDone(&mergeworker)==0 ){
  3857. rc = mergeWorkerStep(&mergeworker);
  3858. }
  3859. mergeWorkerShutdown(&mergeworker, &rc);
  3860. assert( rc!=LSM_OK || mergeworker.nWork==0 || pNew->lhs.iFirst );
  3861. if( rc==LSM_OK && pNew->lhs.iFirst ){
  3862. rc = lsmFsSortedFinish(pDb->pFS, &pNew->lhs);
  3863. }
  3864. nWrite = mergeworker.nWork;
  3865. pNew->flags &= ~LEVEL_INCOMPLETE;
  3866. if( eTree==TREE_NONE ){
  3867. pNew->flags |= LEVEL_FREELIST_ONLY;
  3868. }
  3869. pNew->pMerge = 0;
  3870. }
  3871. if( rc!=LSM_OK || pNew->lhs.iFirst==0 ){
  3872. assert( rc!=LSM_OK || pDb->pWorker->freelist.nEntry==0 );
  3873. lsmDbSnapshotSetLevel(pDb->pWorker, pNext);
  3874. sortedFreeLevel(pDb->pEnv, pNew);
  3875. }else{
  3876. if( pLinked ){
  3877. pLinked->iRoot = 0;
  3878. }else if( pDel ){
  3879. assert( pNew->pNext==pDel );
  3880. pNew->pNext = pDel->pNext;
  3881. lsmFsSortedDelete(pDb->pFS, pDb->pWorker, 1, &pDel->lhs);
  3882. sortedFreeLevel(pDb->pEnv, pDel);
  3883. }
  3884. #if LSM_LOG_STRUCTURE
  3885. lsmSortedDumpStructure(pDb, pDb->pWorker, LSM_LOG_DATA, 0, "new-toplevel");
  3886. #endif
  3887. if( freelist.nEntry ){
  3888. Freelist *p = &pDb->pWorker->freelist;
  3889. lsmFree(pDb->pEnv, p->aEntry);
  3890. memcpy(p, &freelist, sizeof(freelist));
  3891. freelist.aEntry = 0;
  3892. }else{
  3893. pDb->pWorker->freelist.nEntry = 0;
  3894. }
  3895. assertBtreeOk(pDb, &pNew->lhs);
  3896. sortedInvokeWorkHook(pDb);
  3897. }
  3898. if( pnWrite ) *pnWrite = nWrite;
  3899. pDb->pWorker->nWrite += nWrite;
  3900. pDb->pFreelist = 0;
  3901. pDb->bUseFreelist = 0;
  3902. lsmFree(pDb->pEnv, freelist.aEntry);
  3903. return rc;
  3904. }
  3905. /*
  3906. ** The nMerge levels in the LSM beginning with pLevel consist of a
  3907. ** left-hand-side segment only. Replace these levels with a single new
  3908. ** level consisting of a new empty segment on the left-hand-side and the
  3909. ** nMerge segments from the replaced levels on the right-hand-side.
  3910. **
  3911. ** Also, allocate and populate a Merge object and set Level.pMerge to
  3912. ** point to it.
  3913. */
  3914. static int sortedMergeSetup(
  3915. lsm_db *pDb, /* Database handle */
  3916. Level *pLevel, /* First level to merge */
  3917. int nMerge, /* Merge this many levels together */
  3918. Level **ppNew /* New, merged, level */
  3919. ){
  3920. int rc = LSM_OK; /* Return Code */
  3921. Level *pNew; /* New Level object */
  3922. int bUseNext = 0; /* True to link in next separators */
  3923. Merge *pMerge; /* New Merge object */
  3924. int nByte; /* Bytes of space allocated at pMerge */
  3925. #ifdef LSM_DEBUG
  3926. int iLevel;
  3927. Level *pX = pLevel;
  3928. for(iLevel=0; iLevel<nMerge; iLevel++){
  3929. assert( pX->nRight==0 );
  3930. pX = pX->pNext;
  3931. }
  3932. #endif
  3933. /* Allocate the new Level object */
  3934. pNew = (Level *)lsmMallocZeroRc(pDb->pEnv, sizeof(Level), &rc);
  3935. if( pNew ){
  3936. pNew->aRhs = (Segment *)lsmMallocZeroRc(pDb->pEnv,
  3937. nMerge * sizeof(Segment), &rc);
  3938. }
  3939. /* Populate the new Level object */
  3940. if( rc==LSM_OK ){
  3941. Level *pNext = 0; /* Level following pNew */
  3942. int i;
  3943. int bFreeOnly = 1;
  3944. Level *pTopLevel;
  3945. Level *p = pLevel;
  3946. Level **pp;
  3947. pNew->nRight = nMerge;
  3948. pNew->iAge = pLevel->iAge+1;
  3949. for(i=0; i<nMerge; i++){
  3950. assert( p->nRight==0 );
  3951. pNext = p->pNext;
  3952. pNew->aRhs[i] = p->lhs;
  3953. if( (p->flags & LEVEL_FREELIST_ONLY)==0 ) bFreeOnly = 0;
  3954. sortedFreeLevel(pDb->pEnv, p);
  3955. p = pNext;
  3956. }
  3957. if( bFreeOnly ) pNew->flags |= LEVEL_FREELIST_ONLY;
  3958. /* Replace the old levels with the new. */
  3959. pTopLevel = lsmDbSnapshotLevel(pDb->pWorker);
  3960. pNew->pNext = p;
  3961. for(pp=&pTopLevel; *pp!=pLevel; pp=&((*pp)->pNext));
  3962. *pp = pNew;
  3963. lsmDbSnapshotSetLevel(pDb->pWorker, pTopLevel);
  3964. /* Determine whether or not the next separators will be linked in */
  3965. if( pNext && pNext->pMerge==0 && pNext->lhs.iRoot && pNext
  3966. && (bFreeOnly==0 || (pNext->flags & LEVEL_FREELIST_ONLY))
  3967. ){
  3968. bUseNext = 1;
  3969. }
  3970. }
  3971. /* Allocate the merge object */
  3972. nByte = sizeof(Merge) + sizeof(MergeInput) * (nMerge + bUseNext);
  3973. pMerge = (Merge *)lsmMallocZeroRc(pDb->pEnv, nByte, &rc);
  3974. if( pMerge ){
  3975. pMerge->aInput = (MergeInput *)&pMerge[1];
  3976. pMerge->nInput = nMerge + bUseNext;
  3977. pNew->pMerge = pMerge;
  3978. }
  3979. *ppNew = pNew;
  3980. return rc;
  3981. }
  3982. static int mergeWorkerInit(
  3983. lsm_db *pDb, /* Db connection to do merge work */
  3984. Level *pLevel, /* Level to work on merging */
  3985. MergeWorker *pMW /* Object to initialize */
  3986. ){
  3987. int rc = LSM_OK; /* Return code */
  3988. Merge *pMerge = pLevel->pMerge; /* Persistent part of merge state */
  3989. MultiCursor *pCsr = 0; /* Cursor opened for pMW */
  3990. Level *pNext = pLevel->pNext; /* Next level in LSM */
  3991. assert( pDb->pWorker );
  3992. assert( pLevel->pMerge );
  3993. assert( pLevel->nRight>0 );
  3994. memset(pMW, 0, sizeof(MergeWorker));
  3995. pMW->pDb = pDb;
  3996. pMW->pLevel = pLevel;
  3997. pMW->aGobble = lsmMallocZeroRc(pDb->pEnv, sizeof(LsmPgno)*pLevel->nRight,&rc);
  3998. /* Create a multi-cursor to read the data to write to the new
  3999. ** segment. The new segment contains:
  4000. **
  4001. ** 1. Records from LHS of each of the nMerge levels being merged.
  4002. ** 2. Separators from either the last level being merged, or the
  4003. ** separators attached to the LHS of the following level, or neither.
  4004. **
  4005. ** If the new level is the lowest (oldest) in the db, discard any
  4006. ** delete keys. Key annihilation.
  4007. */
  4008. pCsr = multiCursorNew(pDb, &rc);
  4009. if( pCsr ){
  4010. pCsr->flags |= CURSOR_NEXT_OK;
  4011. rc = multiCursorAddRhs(pCsr, pLevel);
  4012. }
  4013. if( rc==LSM_OK && pMerge->nInput > pLevel->nRight ){
  4014. rc = btreeCursorNew(pDb, &pNext->lhs, &pCsr->pBtCsr);
  4015. }else if( pNext ){
  4016. multiCursorReadSeparators(pCsr);
  4017. }else{
  4018. multiCursorIgnoreDelete(pCsr);
  4019. }
  4020. assert( rc!=LSM_OK || pMerge->nInput==(pCsr->nPtr+(pCsr->pBtCsr!=0)) );
  4021. pMW->pCsr = pCsr;
  4022. /* Load the b-tree hierarchy into memory. */
  4023. if( rc==LSM_OK ) rc = mergeWorkerLoadHierarchy(pMW);
  4024. if( rc==LSM_OK && pMW->hier.nHier==0 ){
  4025. pMW->aSave[0].iPgno = pLevel->lhs.iFirst;
  4026. }
  4027. /* Position the cursor. */
  4028. if( rc==LSM_OK ){
  4029. pCsr->pPrevMergePtr = &pMerge->iCurrentPtr;
  4030. if( pLevel->lhs.iFirst==0 ){
  4031. /* The output array is still empty. So position the cursor at the very
  4032. ** start of the input. */
  4033. rc = multiCursorEnd(pCsr, 0);
  4034. }else{
  4035. /* The output array is non-empty. Position the cursor based on the
  4036. ** page/cell data saved in the Merge.aInput[] array. */
  4037. int i;
  4038. for(i=0; rc==LSM_OK && i<pCsr->nPtr; i++){
  4039. MergeInput *pInput = &pMerge->aInput[i];
  4040. if( pInput->iPg ){
  4041. SegmentPtr *pPtr;
  4042. assert( pCsr->aPtr[i].pPg==0 );
  4043. pPtr = &pCsr->aPtr[i];
  4044. rc = segmentPtrLoadPage(pDb->pFS, pPtr, pInput->iPg);
  4045. if( rc==LSM_OK && pPtr->nCell>0 ){
  4046. rc = segmentPtrLoadCell(pPtr, pInput->iCell);
  4047. }
  4048. }
  4049. }
  4050. if( rc==LSM_OK && pCsr->pBtCsr ){
  4051. int (*xCmp)(void *, int, void *, int) = pCsr->pDb->xCmp;
  4052. assert( i==pCsr->nPtr );
  4053. rc = btreeCursorRestore(pCsr->pBtCsr, xCmp, &pMerge->aInput[i]);
  4054. }
  4055. if( rc==LSM_OK ){
  4056. rc = multiCursorSetupTree(pCsr, 0);
  4057. }
  4058. }
  4059. pCsr->flags |= CURSOR_NEXT_OK;
  4060. }
  4061. return rc;
  4062. }
  4063. static int sortedBtreeGobble(
  4064. lsm_db *pDb, /* Worker connection */
  4065. MultiCursor *pCsr, /* Multi-cursor being used for a merge */
  4066. int iGobble /* pCsr->aPtr[] entry to operate on */
  4067. ){
  4068. int rc = LSM_OK;
  4069. if( rtTopic(pCsr->eType)==0 ){
  4070. Segment *pSeg = pCsr->aPtr[iGobble].pSeg;
  4071. LsmPgno *aPg;
  4072. int nPg;
  4073. /* Seek from the root of the b-tree to the segment leaf that may contain
  4074. ** a key equal to the one multi-cursor currently points to. Record the
  4075. ** page number of each b-tree page and the leaf. The segment may be
  4076. ** gobbled up to (but not including) the first of these page numbers.
  4077. */
  4078. assert( pSeg->iRoot>0 );
  4079. aPg = lsmMallocZeroRc(pDb->pEnv, sizeof(LsmPgno)*32, &rc);
  4080. if( rc==LSM_OK ){
  4081. rc = seekInBtree(pCsr, pSeg,
  4082. rtTopic(pCsr->eType), pCsr->key.pData, pCsr->key.nData, aPg, 0
  4083. );
  4084. }
  4085. if( rc==LSM_OK ){
  4086. for(nPg=0; aPg[nPg]; nPg++);
  4087. lsmFsGobble(pDb, pSeg, aPg, nPg);
  4088. }
  4089. lsmFree(pDb->pEnv, aPg);
  4090. }
  4091. return rc;
  4092. }
  4093. /*
  4094. ** Argument p points to a level of age N. Return the number of levels in
  4095. ** the linked list starting at p that have age=N (always at least 1).
  4096. */
  4097. static int sortedCountLevels(Level *p){
  4098. int iAge = p->iAge;
  4099. int nRet = 0;
  4100. do {
  4101. nRet++;
  4102. p = p->pNext;
  4103. }while( p && p->iAge==iAge );
  4104. return nRet;
  4105. }
  4106. static int sortedSelectLevel(lsm_db *pDb, int nMerge, Level **ppOut){
  4107. Level *pTopLevel = lsmDbSnapshotLevel(pDb->pWorker);
  4108. int rc = LSM_OK;
  4109. Level *pLevel = 0; /* Output value */
  4110. Level *pBest = 0; /* Best level to work on found so far */
  4111. int nBest; /* Number of segments merged at pBest */
  4112. Level *pThis = 0; /* First in run of levels with age=iAge */
  4113. int nThis = 0; /* Number of levels starting at pThis */
  4114. assert( nMerge>=1 );
  4115. nBest = LSM_MAX(1, nMerge-1);
  4116. /* Find the longest contiguous run of levels not currently undergoing a
  4117. ** merge with the same age in the structure. Or the level being merged
  4118. ** with the largest number of right-hand segments. Work on it. */
  4119. for(pLevel=pTopLevel; pLevel; pLevel=pLevel->pNext){
  4120. if( pLevel->nRight==0 && pThis && pLevel->iAge==pThis->iAge ){
  4121. nThis++;
  4122. }else{
  4123. if( nThis>nBest ){
  4124. if( (pLevel->iAge!=pThis->iAge+1)
  4125. || (pLevel->nRight==0 && sortedCountLevels(pLevel)<=pDb->nMerge)
  4126. ){
  4127. pBest = pThis;
  4128. nBest = nThis;
  4129. }
  4130. }
  4131. if( pLevel->nRight ){
  4132. if( pLevel->nRight>nBest ){
  4133. nBest = pLevel->nRight;
  4134. pBest = pLevel;
  4135. }
  4136. nThis = 0;
  4137. pThis = 0;
  4138. }else{
  4139. pThis = pLevel;
  4140. nThis = 1;
  4141. }
  4142. }
  4143. }
  4144. if( nThis>nBest ){
  4145. assert( pThis );
  4146. pBest = pThis;
  4147. nBest = nThis;
  4148. }
  4149. if( pBest==0 && nMerge==1 ){
  4150. int nFree = 0;
  4151. int nUsr = 0;
  4152. for(pLevel=pTopLevel; pLevel; pLevel=pLevel->pNext){
  4153. assert( !pLevel->nRight );
  4154. if( pLevel->flags & LEVEL_FREELIST_ONLY ){
  4155. nFree++;
  4156. }else{
  4157. nUsr++;
  4158. }
  4159. }
  4160. if( nUsr>1 ){
  4161. pBest = pTopLevel;
  4162. nBest = nFree + nUsr;
  4163. }
  4164. }
  4165. if( pBest ){
  4166. if( pBest->nRight==0 ){
  4167. rc = sortedMergeSetup(pDb, pBest, nBest, ppOut);
  4168. }else{
  4169. *ppOut = pBest;
  4170. }
  4171. }
  4172. return rc;
  4173. }
  4174. static int sortedDbIsFull(lsm_db *pDb){
  4175. Level *pTop = lsmDbSnapshotLevel(pDb->pWorker);
  4176. if( lsmDatabaseFull(pDb) ) return 1;
  4177. if( pTop && pTop->iAge==0
  4178. && (pTop->nRight || sortedCountLevels(pTop)>=pDb->nMerge)
  4179. ){
  4180. return 1;
  4181. }
  4182. return 0;
  4183. }
  4184. typedef struct MoveBlockCtx MoveBlockCtx;
  4185. struct MoveBlockCtx {
  4186. int iSeen; /* Previous free block on list */
  4187. int iFrom; /* Total number of blocks in file */
  4188. };
  4189. static int moveBlockCb(void *pCtx, int iBlk, i64 iSnapshot){
  4190. MoveBlockCtx *p = (MoveBlockCtx *)pCtx;
  4191. assert( p->iFrom==0 );
  4192. if( iBlk==(p->iSeen-1) ){
  4193. p->iSeen = iBlk;
  4194. return 0;
  4195. }
  4196. p->iFrom = p->iSeen-1;
  4197. return 1;
  4198. }
  4199. /*
  4200. ** This function is called to further compact a database for which all
  4201. ** of the content has already been merged into a single segment. If
  4202. ** possible, it moves the contents of a single block from the end of the
  4203. ** file to a free-block that lies closer to the start of the file (allowing
  4204. ** the file to be eventually truncated).
  4205. */
  4206. static int sortedMoveBlock(lsm_db *pDb, int *pnWrite){
  4207. Snapshot *p = pDb->pWorker;
  4208. Level *pLvl = lsmDbSnapshotLevel(p);
  4209. int iFrom; /* Block to move */
  4210. int iTo; /* Destination to move block to */
  4211. int rc; /* Return code */
  4212. MoveBlockCtx sCtx;
  4213. assert( pLvl->pNext==0 && pLvl->nRight==0 );
  4214. assert( p->redirect.n<=LSM_MAX_BLOCK_REDIRECTS );
  4215. *pnWrite = 0;
  4216. /* Check that the redirect array is not already full. If it is, return
  4217. ** without moving any database content. */
  4218. if( p->redirect.n>=LSM_MAX_BLOCK_REDIRECTS ) return LSM_OK;
  4219. /* Find the last block of content in the database file. Do this by
  4220. ** traversing the free-list in reverse (descending block number) order.
  4221. ** The first block not on the free list is the one that will be moved.
  4222. ** Since the db consists of a single segment, there is no ambiguity as
  4223. ** to which segment the block belongs to. */
  4224. sCtx.iSeen = p->nBlock+1;
  4225. sCtx.iFrom = 0;
  4226. rc = lsmWalkFreelist(pDb, 1, moveBlockCb, &sCtx);
  4227. if( rc!=LSM_OK || sCtx.iFrom==0 ) return rc;
  4228. iFrom = sCtx.iFrom;
  4229. /* Find the first free block in the database, ignoring block 1. Block
  4230. ** 1 is tricky as it is smaller than the other blocks. */
  4231. rc = lsmBlockAllocate(pDb, iFrom, &iTo);
  4232. if( rc!=LSM_OK || iTo==0 ) return rc;
  4233. assert( iTo!=1 && iTo<iFrom );
  4234. rc = lsmFsMoveBlock(pDb->pFS, &pLvl->lhs, iTo, iFrom);
  4235. if( rc==LSM_OK ){
  4236. if( p->redirect.a==0 ){
  4237. int nByte = sizeof(struct RedirectEntry) * LSM_MAX_BLOCK_REDIRECTS;
  4238. p->redirect.a = lsmMallocZeroRc(pDb->pEnv, nByte, &rc);
  4239. }
  4240. if( rc==LSM_OK ){
  4241. /* Check if the block just moved was already redirected. */
  4242. int i;
  4243. for(i=0; i<p->redirect.n; i++){
  4244. if( p->redirect.a[i].iTo==iFrom ) break;
  4245. }
  4246. if( i==p->redirect.n ){
  4247. /* Block iFrom was not already redirected. Add a new array entry. */
  4248. memmove(&p->redirect.a[1], &p->redirect.a[0],
  4249. sizeof(struct RedirectEntry) * p->redirect.n
  4250. );
  4251. p->redirect.a[0].iFrom = iFrom;
  4252. p->redirect.a[0].iTo = iTo;
  4253. p->redirect.n++;
  4254. }else{
  4255. /* Block iFrom was already redirected. Overwrite existing entry. */
  4256. p->redirect.a[i].iTo = iTo;
  4257. }
  4258. rc = lsmBlockFree(pDb, iFrom);
  4259. *pnWrite = lsmFsBlockSize(pDb->pFS) / lsmFsPageSize(pDb->pFS);
  4260. pLvl->lhs.pRedirect = &p->redirect;
  4261. }
  4262. }
  4263. #if LSM_LOG_STRUCTURE
  4264. if( rc==LSM_OK ){
  4265. char aBuf[64];
  4266. sprintf(aBuf, "move-block %d/%d", p->redirect.n-1, LSM_MAX_BLOCK_REDIRECTS);
  4267. lsmSortedDumpStructure(pDb, pDb->pWorker, LSM_LOG_DATA, 0, aBuf);
  4268. }
  4269. #endif
  4270. return rc;
  4271. }
  4272. /*
  4273. */
  4274. static int mergeInsertFreelistSegments(
  4275. lsm_db *pDb,
  4276. int nFree,
  4277. MergeWorker *pMW
  4278. ){
  4279. int rc = LSM_OK;
  4280. if( nFree>0 ){
  4281. MultiCursor *pCsr = pMW->pCsr;
  4282. Level *pLvl = pMW->pLevel;
  4283. SegmentPtr *aNew1;
  4284. Segment *aNew2;
  4285. Level *pIter;
  4286. Level *pNext;
  4287. int i = 0;
  4288. aNew1 = (SegmentPtr *)lsmMallocZeroRc(
  4289. pDb->pEnv, sizeof(SegmentPtr) * (pCsr->nPtr+nFree), &rc
  4290. );
  4291. if( rc ) return rc;
  4292. memcpy(&aNew1[nFree], pCsr->aPtr, sizeof(SegmentPtr)*pCsr->nPtr);
  4293. pCsr->nPtr += nFree;
  4294. lsmFree(pDb->pEnv, pCsr->aTree);
  4295. lsmFree(pDb->pEnv, pCsr->aPtr);
  4296. pCsr->aTree = 0;
  4297. pCsr->aPtr = aNew1;
  4298. aNew2 = (Segment *)lsmMallocZeroRc(
  4299. pDb->pEnv, sizeof(Segment) * (pLvl->nRight+nFree), &rc
  4300. );
  4301. if( rc ) return rc;
  4302. memcpy(&aNew2[nFree], pLvl->aRhs, sizeof(Segment)*pLvl->nRight);
  4303. pLvl->nRight += nFree;
  4304. lsmFree(pDb->pEnv, pLvl->aRhs);
  4305. pLvl->aRhs = aNew2;
  4306. for(pIter=pDb->pWorker->pLevel; rc==LSM_OK && pIter!=pLvl; pIter=pNext){
  4307. Segment *pSeg = &pLvl->aRhs[i];
  4308. memcpy(pSeg, &pIter->lhs, sizeof(Segment));
  4309. pCsr->aPtr[i].pSeg = pSeg;
  4310. pCsr->aPtr[i].pLevel = pLvl;
  4311. rc = segmentPtrEnd(pCsr, &pCsr->aPtr[i], 0);
  4312. pDb->pWorker->pLevel = pNext = pIter->pNext;
  4313. sortedFreeLevel(pDb->pEnv, pIter);
  4314. i++;
  4315. }
  4316. assert( i==nFree );
  4317. assert( rc!=LSM_OK || pDb->pWorker->pLevel==pLvl );
  4318. for(i=nFree; i<pCsr->nPtr; i++){
  4319. pCsr->aPtr[i].pSeg = &pLvl->aRhs[i];
  4320. }
  4321. lsmFree(pDb->pEnv, pMW->aGobble);
  4322. pMW->aGobble = 0;
  4323. }
  4324. return rc;
  4325. }
  4326. static int sortedWork(
  4327. lsm_db *pDb, /* Database handle. Must be worker. */
  4328. int nWork, /* Number of pages of work to do */
  4329. int nMerge, /* Try to merge this many levels at once */
  4330. int bFlush, /* Set if call is to make room for a flush */
  4331. int *pnWrite /* OUT: Actual number of pages written */
  4332. ){
  4333. int rc = LSM_OK; /* Return Code */
  4334. int nRemaining = nWork; /* Units of work to do before returning */
  4335. Snapshot *pWorker = pDb->pWorker;
  4336. assert( pWorker );
  4337. if( lsmDbSnapshotLevel(pWorker)==0 ) return LSM_OK;
  4338. while( nRemaining>0 ){
  4339. Level *pLevel = 0;
  4340. /* Find a level to work on. */
  4341. rc = sortedSelectLevel(pDb, nMerge, &pLevel);
  4342. assert( rc==LSM_OK || pLevel==0 );
  4343. if( pLevel==0 ){
  4344. int nDone = 0;
  4345. Level *pTopLevel = lsmDbSnapshotLevel(pDb->pWorker);
  4346. if( bFlush==0 && nMerge==1 && pTopLevel && pTopLevel->pNext==0 ){
  4347. rc = sortedMoveBlock(pDb, &nDone);
  4348. }
  4349. nRemaining -= nDone;
  4350. /* Could not find any work to do. Finished. */
  4351. if( nDone==0 ) break;
  4352. }else{
  4353. int bSave = 0;
  4354. Freelist freelist = {0, 0, 0};
  4355. MergeWorker mergeworker; /* State used to work on the level merge */
  4356. assert( pDb->bIncrMerge==0 );
  4357. assert( pDb->pFreelist==0 && pDb->bUseFreelist==0 );
  4358. pDb->bIncrMerge = 1;
  4359. rc = mergeWorkerInit(pDb, pLevel, &mergeworker);
  4360. assert( mergeworker.nWork==0 );
  4361. while( rc==LSM_OK
  4362. && 0==mergeWorkerDone(&mergeworker)
  4363. && (mergeworker.nWork<nRemaining || pDb->bUseFreelist)
  4364. ){
  4365. int eType = rtTopic(mergeworker.pCsr->eType);
  4366. rc = mergeWorkerStep(&mergeworker);
  4367. /* If the cursor now points at the first entry past the end of the
  4368. ** user data (i.e. either to EOF or to the first free-list entry
  4369. ** that will be added to the run), then check if it is possible to
  4370. ** merge in any free-list entries that are either in-memory or in
  4371. ** free-list-only blocks. */
  4372. if( rc==LSM_OK && nMerge==1 && eType==0
  4373. && (rtTopic(mergeworker.pCsr->eType) || mergeWorkerDone(&mergeworker))
  4374. ){
  4375. int nFree = 0; /* Number of free-list-only levels to merge */
  4376. Level *pLvl;
  4377. assert( pDb->pFreelist==0 && pDb->bUseFreelist==0 );
  4378. /* Now check if all levels containing data newer than this one
  4379. ** are single-segment free-list only levels. If so, they will be
  4380. ** merged in now. */
  4381. for(pLvl=pDb->pWorker->pLevel;
  4382. pLvl!=mergeworker.pLevel && (pLvl->flags & LEVEL_FREELIST_ONLY);
  4383. pLvl=pLvl->pNext
  4384. ){
  4385. assert( pLvl->nRight==0 );
  4386. nFree++;
  4387. }
  4388. if( pLvl==mergeworker.pLevel ){
  4389. rc = mergeInsertFreelistSegments(pDb, nFree, &mergeworker);
  4390. if( rc==LSM_OK ){
  4391. rc = multiCursorVisitFreelist(mergeworker.pCsr);
  4392. }
  4393. if( rc==LSM_OK ){
  4394. rc = multiCursorSetupTree(mergeworker.pCsr, 0);
  4395. pDb->pFreelist = &freelist;
  4396. pDb->bUseFreelist = 1;
  4397. }
  4398. }
  4399. }
  4400. }
  4401. nRemaining -= LSM_MAX(mergeworker.nWork, 1);
  4402. if( rc==LSM_OK ){
  4403. /* Check if the merge operation is completely finished. If not,
  4404. ** gobble up (declare eligible for recycling) any pages from rhs
  4405. ** segments for which the content has been completely merged into
  4406. ** the lhs of the level. */
  4407. if( mergeWorkerDone(&mergeworker)==0 ){
  4408. int i;
  4409. for(i=0; i<pLevel->nRight; i++){
  4410. SegmentPtr *pGobble = &mergeworker.pCsr->aPtr[i];
  4411. if( pGobble->pSeg->iRoot ){
  4412. rc = sortedBtreeGobble(pDb, mergeworker.pCsr, i);
  4413. }else if( mergeworker.aGobble[i] ){
  4414. lsmFsGobble(pDb, pGobble->pSeg, &mergeworker.aGobble[i], 1);
  4415. }
  4416. }
  4417. }else{
  4418. int i;
  4419. int bEmpty;
  4420. mergeWorkerShutdown(&mergeworker, &rc);
  4421. bEmpty = (pLevel->lhs.iFirst==0);
  4422. if( bEmpty==0 && rc==LSM_OK ){
  4423. rc = lsmFsSortedFinish(pDb->pFS, &pLevel->lhs);
  4424. }
  4425. if( pDb->bUseFreelist ){
  4426. Freelist *p = &pDb->pWorker->freelist;
  4427. lsmFree(pDb->pEnv, p->aEntry);
  4428. memcpy(p, &freelist, sizeof(freelist));
  4429. pDb->bUseFreelist = 0;
  4430. pDb->pFreelist = 0;
  4431. bSave = 1;
  4432. }
  4433. for(i=0; i<pLevel->nRight; i++){
  4434. lsmFsSortedDelete(pDb->pFS, pWorker, 1, &pLevel->aRhs[i]);
  4435. }
  4436. if( bEmpty ){
  4437. /* If the new level is completely empty, remove it from the
  4438. ** database snapshot. This can only happen if all input keys were
  4439. ** annihilated. Since keys are only annihilated if the new level
  4440. ** is the last in the linked list (contains the most ancient of
  4441. ** database content), this guarantees that pLevel->pNext==0. */
  4442. Level *pTop; /* Top level of worker snapshot */
  4443. Level **pp; /* Read/write iterator for Level.pNext list */
  4444. assert( pLevel->pNext==0 );
  4445. /* Remove the level from the worker snapshot. */
  4446. pTop = lsmDbSnapshotLevel(pWorker);
  4447. for(pp=&pTop; *pp!=pLevel; pp=&((*pp)->pNext));
  4448. *pp = pLevel->pNext;
  4449. lsmDbSnapshotSetLevel(pWorker, pTop);
  4450. /* Free the Level structure. */
  4451. sortedFreeLevel(pDb->pEnv, pLevel);
  4452. }else{
  4453. /* Free the separators of the next level, if required. */
  4454. if( pLevel->pMerge->nInput > pLevel->nRight ){
  4455. assert( pLevel->pNext->lhs.iRoot );
  4456. pLevel->pNext->lhs.iRoot = 0;
  4457. }
  4458. /* Zero the right-hand-side of pLevel */
  4459. lsmFree(pDb->pEnv, pLevel->aRhs);
  4460. pLevel->nRight = 0;
  4461. pLevel->aRhs = 0;
  4462. /* Free the Merge object */
  4463. lsmFree(pDb->pEnv, pLevel->pMerge);
  4464. pLevel->pMerge = 0;
  4465. }
  4466. if( bSave && rc==LSM_OK ){
  4467. pDb->bIncrMerge = 0;
  4468. rc = lsmSaveWorker(pDb, 0);
  4469. }
  4470. }
  4471. }
  4472. /* Clean up the MergeWorker object initialized above. If no error
  4473. ** has occurred, invoke the work-hook to inform the application that
  4474. ** the database structure has changed. */
  4475. mergeWorkerShutdown(&mergeworker, &rc);
  4476. pDb->bIncrMerge = 0;
  4477. if( rc==LSM_OK ) sortedInvokeWorkHook(pDb);
  4478. #if LSM_LOG_STRUCTURE
  4479. lsmSortedDumpStructure(pDb, pDb->pWorker, LSM_LOG_DATA, 0, "work");
  4480. #endif
  4481. assertBtreeOk(pDb, &pLevel->lhs);
  4482. assertRunInOrder(pDb, &pLevel->lhs);
  4483. /* If bFlush is true and the database is no longer considered "full",
  4484. ** break out of the loop even if nRemaining is still greater than
  4485. ** zero. The caller has an in-memory tree to flush to disk. */
  4486. if( bFlush && sortedDbIsFull(pDb)==0 ) break;
  4487. }
  4488. }
  4489. if( pnWrite ) *pnWrite = (nWork - nRemaining);
  4490. pWorker->nWrite += (nWork - nRemaining);
  4491. #ifdef LSM_LOG_WORK
  4492. lsmLogMessage(pDb, rc, "sortedWork(): %d pages", (nWork-nRemaining));
  4493. #endif
  4494. return rc;
  4495. }
  4496. /*
  4497. ** The database connection passed as the first argument must be a worker
  4498. ** connection. This function checks if there exists an "old" in-memory tree
  4499. ** ready to be flushed to disk. If so, true is returned. Otherwise false.
  4500. **
  4501. ** If an error occurs, *pRc is set to an LSM error code before returning.
  4502. ** It is assumed that *pRc is set to LSM_OK when this function is called.
  4503. */
  4504. static int sortedTreeHasOld(lsm_db *pDb, int *pRc){
  4505. int rc = LSM_OK;
  4506. int bRet = 0;
  4507. assert( pDb->pWorker );
  4508. if( *pRc==LSM_OK ){
  4509. if( rc==LSM_OK
  4510. && pDb->treehdr.iOldShmid
  4511. && pDb->treehdr.iOldLog!=pDb->pWorker->iLogOff
  4512. ){
  4513. bRet = 1;
  4514. }else{
  4515. bRet = 0;
  4516. }
  4517. *pRc = rc;
  4518. }
  4519. assert( *pRc==LSM_OK || bRet==0 );
  4520. return bRet;
  4521. }
  4522. /*
  4523. ** Create a new free-list only top-level segment. Return LSM_OK if successful
  4524. ** or an LSM error code if some error occurs.
  4525. */
  4526. static int sortedNewFreelistOnly(lsm_db *pDb){
  4527. return sortedNewToplevel(pDb, TREE_NONE, 0);
  4528. }
  4529. int lsmSaveWorker(lsm_db *pDb, int bFlush){
  4530. Snapshot *p = pDb->pWorker;
  4531. if( p->freelist.nEntry>pDb->nMaxFreelist ){
  4532. int rc = sortedNewFreelistOnly(pDb);
  4533. if( rc!=LSM_OK ) return rc;
  4534. }
  4535. return lsmCheckpointSaveWorker(pDb, bFlush);
  4536. }
  4537. static int doLsmSingleWork(
  4538. lsm_db *pDb,
  4539. int bShutdown,
  4540. int nMerge, /* Minimum segments to merge together */
  4541. int nPage, /* Number of pages to write to disk */
  4542. int *pnWrite, /* OUT: Pages actually written to disk */
  4543. int *pbCkpt /* OUT: True if an auto-checkpoint is req. */
  4544. ){
  4545. Snapshot *pWorker; /* Worker snapshot */
  4546. int rc = LSM_OK; /* Return code */
  4547. int bDirty = 0;
  4548. int nMax = nPage; /* Maximum pages to write to disk */
  4549. int nRem = nPage;
  4550. int bCkpt = 0;
  4551. assert( nPage>0 );
  4552. /* Open the worker 'transaction'. It will be closed before this function
  4553. ** returns. */
  4554. assert( pDb->pWorker==0 );
  4555. rc = lsmBeginWork(pDb);
  4556. if( rc!=LSM_OK ) return rc;
  4557. pWorker = pDb->pWorker;
  4558. /* If this connection is doing auto-checkpoints, set nMax (and nRem) so
  4559. ** that this call stops writing when the auto-checkpoint is due. The
  4560. ** caller will do the checkpoint, then possibly call this function again. */
  4561. if( bShutdown==0 && pDb->nAutockpt ){
  4562. u32 nSync;
  4563. u32 nUnsync;
  4564. int nPgsz;
  4565. lsmCheckpointSynced(pDb, 0, 0, &nSync);
  4566. nUnsync = lsmCheckpointNWrite(pDb->pShmhdr->aSnap1, 0);
  4567. nPgsz = lsmCheckpointPgsz(pDb->pShmhdr->aSnap1);
  4568. nMax = (int)LSM_MIN(nMax, (pDb->nAutockpt/nPgsz) - (int)(nUnsync-nSync));
  4569. if( nMax<nRem ){
  4570. bCkpt = 1;
  4571. nRem = LSM_MAX(nMax, 0);
  4572. }
  4573. }
  4574. /* If there exists in-memory data ready to be flushed to disk, attempt
  4575. ** to flush it now. */
  4576. if( pDb->nTransOpen==0 ){
  4577. rc = lsmTreeLoadHeader(pDb, 0);
  4578. }
  4579. if( sortedTreeHasOld(pDb, &rc) ){
  4580. /* sortedDbIsFull() returns non-zero if either (a) there are too many
  4581. ** levels in total in the db, or (b) there are too many levels with the
  4582. ** the same age in the db. Either way, call sortedWork() to merge
  4583. ** existing segments together until this condition is cleared. */
  4584. if( sortedDbIsFull(pDb) ){
  4585. int nPg = 0;
  4586. rc = sortedWork(pDb, nRem, nMerge, 1, &nPg);
  4587. nRem -= nPg;
  4588. assert( rc!=LSM_OK || nRem<=0 || !sortedDbIsFull(pDb) );
  4589. bDirty = 1;
  4590. }
  4591. if( rc==LSM_OK && nRem>0 ){
  4592. int nPg = 0;
  4593. rc = sortedNewToplevel(pDb, TREE_OLD, &nPg);
  4594. nRem -= nPg;
  4595. if( rc==LSM_OK ){
  4596. if( pDb->nTransOpen>0 ){
  4597. lsmTreeDiscardOld(pDb);
  4598. }
  4599. rc = lsmSaveWorker(pDb, 1);
  4600. bDirty = 0;
  4601. }
  4602. }
  4603. }
  4604. /* If nPage is still greater than zero, do some merging. */
  4605. if( rc==LSM_OK && nRem>0 && bShutdown==0 ){
  4606. int nPg = 0;
  4607. rc = sortedWork(pDb, nRem, nMerge, 0, &nPg);
  4608. nRem -= nPg;
  4609. if( nPg ) bDirty = 1;
  4610. }
  4611. /* If the in-memory part of the free-list is too large, write a new
  4612. ** top-level containing just the in-memory free-list entries to disk. */
  4613. if( rc==LSM_OK && pDb->pWorker->freelist.nEntry > pDb->nMaxFreelist ){
  4614. while( rc==LSM_OK && lsmDatabaseFull(pDb) ){
  4615. int nPg = 0;
  4616. rc = sortedWork(pDb, 16, nMerge, 1, &nPg);
  4617. nRem -= nPg;
  4618. }
  4619. if( rc==LSM_OK ){
  4620. rc = sortedNewFreelistOnly(pDb);
  4621. }
  4622. bDirty = 1;
  4623. }
  4624. if( rc==LSM_OK ){
  4625. *pnWrite = (nMax - nRem);
  4626. *pbCkpt = (bCkpt && nRem<=0);
  4627. if( nMerge==1 && pDb->nAutockpt>0 && *pnWrite>0
  4628. && pWorker->pLevel
  4629. && pWorker->pLevel->nRight==0
  4630. && pWorker->pLevel->pNext==0
  4631. ){
  4632. *pbCkpt = 1;
  4633. }
  4634. }
  4635. if( rc==LSM_OK && bDirty ){
  4636. lsmFinishWork(pDb, 0, &rc);
  4637. }else{
  4638. int rcdummy = LSM_BUSY;
  4639. lsmFinishWork(pDb, 0, &rcdummy);
  4640. *pnWrite = 0;
  4641. }
  4642. assert( pDb->pWorker==0 );
  4643. return rc;
  4644. }
  4645. static int doLsmWork(lsm_db *pDb, int nMerge, int nPage, int *pnWrite){
  4646. int rc = LSM_OK; /* Return code */
  4647. int nWrite = 0; /* Number of pages written */
  4648. assert( nMerge>=1 );
  4649. if( nPage!=0 ){
  4650. int bCkpt = 0;
  4651. do {
  4652. int nThis = 0;
  4653. int nReq = (nPage>=0) ? (nPage-nWrite) : ((int)0x7FFFFFFF);
  4654. bCkpt = 0;
  4655. rc = doLsmSingleWork(pDb, 0, nMerge, nReq, &nThis, &bCkpt);
  4656. nWrite += nThis;
  4657. if( rc==LSM_OK && bCkpt ){
  4658. rc = lsm_checkpoint(pDb, 0);
  4659. }
  4660. }while( rc==LSM_OK && bCkpt && (nWrite<nPage || nPage<0) );
  4661. }
  4662. if( pnWrite ){
  4663. if( rc==LSM_OK ){
  4664. *pnWrite = nWrite;
  4665. }else{
  4666. *pnWrite = 0;
  4667. }
  4668. }
  4669. return rc;
  4670. }
  4671. /*
  4672. ** Perform work to merge database segments together.
  4673. */
  4674. int lsm_work(lsm_db *pDb, int nMerge, int nKB, int *pnWrite){
  4675. int rc; /* Return code */
  4676. int nPgsz; /* Nominal page size in bytes */
  4677. int nPage; /* Equivalent of nKB in pages */
  4678. int nWrite = 0; /* Number of pages written */
  4679. /* This function may not be called if pDb has an open read or write
  4680. ** transaction. Return LSM_MISUSE if an application attempts this. */
  4681. if( pDb->nTransOpen || pDb->pCsr ) return LSM_MISUSE_BKPT;
  4682. if( nMerge<=0 ) nMerge = pDb->nMerge;
  4683. lsmFsPurgeCache(pDb->pFS);
  4684. /* Convert from KB to pages */
  4685. nPgsz = lsmFsPageSize(pDb->pFS);
  4686. if( nKB>=0 ){
  4687. nPage = ((i64)nKB * 1024 + nPgsz - 1) / nPgsz;
  4688. }else{
  4689. nPage = -1;
  4690. }
  4691. rc = doLsmWork(pDb, nMerge, nPage, &nWrite);
  4692. if( pnWrite ){
  4693. /* Convert back from pages to KB */
  4694. *pnWrite = (int)(((i64)nWrite * 1024 + nPgsz - 1) / nPgsz);
  4695. }
  4696. return rc;
  4697. }
  4698. int lsm_flush(lsm_db *db){
  4699. int rc;
  4700. if( db->nTransOpen>0 || db->pCsr ){
  4701. rc = LSM_MISUSE_BKPT;
  4702. }else{
  4703. rc = lsmBeginWriteTrans(db);
  4704. if( rc==LSM_OK ){
  4705. lsmFlushTreeToDisk(db);
  4706. lsmTreeDiscardOld(db);
  4707. lsmTreeMakeOld(db);
  4708. lsmTreeDiscardOld(db);
  4709. }
  4710. if( rc==LSM_OK ){
  4711. rc = lsmFinishWriteTrans(db, 1);
  4712. }else{
  4713. lsmFinishWriteTrans(db, 0);
  4714. }
  4715. lsmFinishReadTrans(db);
  4716. }
  4717. return rc;
  4718. }
  4719. /*
  4720. ** This function is called in auto-work mode to perform merging work on
  4721. ** the data structure. It performs enough merging work to prevent the
  4722. ** height of the tree from growing indefinitely assuming that roughly
  4723. ** nUnit database pages worth of data have been written to the database
  4724. ** (i.e. the in-memory tree) since the last call.
  4725. */
  4726. int lsmSortedAutoWork(
  4727. lsm_db *pDb, /* Database handle */
  4728. int nUnit /* Pages of data written to in-memory tree */
  4729. ){
  4730. int rc = LSM_OK; /* Return code */
  4731. int nDepth = 0; /* Current height of tree (longest path) */
  4732. Level *pLevel; /* Used to iterate through levels */
  4733. int bRestore = 0;
  4734. assert( pDb->pWorker==0 );
  4735. assert( pDb->nTransOpen>0 );
  4736. /* Determine how many units of work to do before returning. One unit of
  4737. ** work is achieved by writing one page (~4KB) of merged data. */
  4738. for(pLevel=lsmDbSnapshotLevel(pDb->pClient); pLevel; pLevel=pLevel->pNext){
  4739. /* nDepth += LSM_MAX(1, pLevel->nRight); */
  4740. nDepth += 1;
  4741. }
  4742. if( lsmTreeHasOld(pDb) ){
  4743. nDepth += 1;
  4744. bRestore = 1;
  4745. rc = lsmSaveCursors(pDb);
  4746. if( rc!=LSM_OK ) return rc;
  4747. }
  4748. if( nDepth>0 ){
  4749. int nRemaining; /* Units of work to do before returning */
  4750. nRemaining = nUnit * nDepth;
  4751. #ifdef LSM_LOG_WORK
  4752. lsmLogMessage(pDb, rc, "lsmSortedAutoWork(): %d*%d = %d pages",
  4753. nUnit, nDepth, nRemaining);
  4754. #endif
  4755. assert( nRemaining>=0 );
  4756. rc = doLsmWork(pDb, pDb->nMerge, nRemaining, 0);
  4757. if( rc==LSM_BUSY ) rc = LSM_OK;
  4758. if( bRestore && pDb->pCsr ){
  4759. lsmMCursorFreeCache(pDb);
  4760. lsmFreeSnapshot(pDb->pEnv, pDb->pClient);
  4761. pDb->pClient = 0;
  4762. if( rc==LSM_OK ){
  4763. rc = lsmCheckpointLoad(pDb, 0);
  4764. }
  4765. if( rc==LSM_OK ){
  4766. rc = lsmCheckpointDeserialize(pDb, 0, pDb->aSnapshot, &pDb->pClient);
  4767. }
  4768. if( rc==LSM_OK ){
  4769. rc = lsmRestoreCursors(pDb);
  4770. }
  4771. }
  4772. }
  4773. return rc;
  4774. }
  4775. /*
  4776. ** This function is only called during system shutdown. The contents of
  4777. ** any in-memory trees present (old or current) are written out to disk.
  4778. */
  4779. int lsmFlushTreeToDisk(lsm_db *pDb){
  4780. int rc;
  4781. rc = lsmBeginWork(pDb);
  4782. while( rc==LSM_OK && sortedDbIsFull(pDb) ){
  4783. rc = sortedWork(pDb, 256, pDb->nMerge, 1, 0);
  4784. }
  4785. if( rc==LSM_OK ){
  4786. rc = sortedNewToplevel(pDb, TREE_BOTH, 0);
  4787. }
  4788. lsmFinishWork(pDb, 1, &rc);
  4789. return rc;
  4790. }
  4791. /*
  4792. ** Return a string representation of the segment passed as the only argument.
  4793. ** Space for the returned string is allocated using lsmMalloc(), and should
  4794. ** be freed by the caller using lsmFree().
  4795. */
  4796. static char *segToString(lsm_env *pEnv, Segment *pSeg, int nMin){
  4797. LsmPgno nSize = pSeg->nSize;
  4798. LsmPgno iRoot = pSeg->iRoot;
  4799. LsmPgno iFirst = pSeg->iFirst;
  4800. LsmPgno iLast = pSeg->iLastPg;
  4801. char *z;
  4802. char *z1;
  4803. char *z2;
  4804. int nPad;
  4805. z1 = lsmMallocPrintf(pEnv, "%d.%d", iFirst, iLast);
  4806. if( iRoot ){
  4807. z2 = lsmMallocPrintf(pEnv, "root=%lld", iRoot);
  4808. }else{
  4809. z2 = lsmMallocPrintf(pEnv, "size=%lld", nSize);
  4810. }
  4811. nPad = nMin - 2 - strlen(z1) - 1 - strlen(z2);
  4812. nPad = LSM_MAX(0, nPad);
  4813. if( iRoot ){
  4814. z = lsmMallocPrintf(pEnv, "/%s %*s%s\\", z1, nPad, "", z2);
  4815. }else{
  4816. z = lsmMallocPrintf(pEnv, "|%s %*s%s|", z1, nPad, "", z2);
  4817. }
  4818. lsmFree(pEnv, z1);
  4819. lsmFree(pEnv, z2);
  4820. return z;
  4821. }
  4822. static int fileToString(
  4823. lsm_db *pDb, /* For xMalloc() */
  4824. char *aBuf,
  4825. int nBuf,
  4826. int nMin,
  4827. Segment *pSeg
  4828. ){
  4829. int i = 0;
  4830. if( pSeg ){
  4831. char *zSeg;
  4832. zSeg = segToString(pDb->pEnv, pSeg, nMin);
  4833. snprintf(&aBuf[i], nBuf-i, "%s", zSeg);
  4834. i += strlen(&aBuf[i]);
  4835. lsmFree(pDb->pEnv, zSeg);
  4836. #ifdef LSM_LOG_FREELIST
  4837. lsmInfoArrayStructure(pDb, 1, pSeg->iFirst, &zSeg);
  4838. snprintf(&aBuf[i], nBuf-1, " (%s)", zSeg);
  4839. i += strlen(&aBuf[i]);
  4840. lsmFree(pDb->pEnv, zSeg);
  4841. #endif
  4842. aBuf[nBuf] = 0;
  4843. }else{
  4844. aBuf[0] = '\0';
  4845. }
  4846. return i;
  4847. }
  4848. void sortedDumpPage(lsm_db *pDb, Segment *pRun, Page *pPg, int bVals){
  4849. LsmBlob blob = {0, 0, 0}; /* LsmBlob used for keys */
  4850. LsmString s;
  4851. int i;
  4852. int nRec;
  4853. LsmPgno iPtr;
  4854. int flags;
  4855. u8 *aData;
  4856. int nData;
  4857. aData = fsPageData(pPg, &nData);
  4858. nRec = pageGetNRec(aData, nData);
  4859. iPtr = pageGetPtr(aData, nData);
  4860. flags = pageGetFlags(aData, nData);
  4861. lsmStringInit(&s, pDb->pEnv);
  4862. lsmStringAppendf(&s,"nCell=%d iPtr=%lld flags=%d {", nRec, iPtr, flags);
  4863. if( flags&SEGMENT_BTREE_FLAG ) iPtr = 0;
  4864. for(i=0; i<nRec; i++){
  4865. Page *pRef = 0; /* Pointer to page iRef */
  4866. int iChar;
  4867. u8 *aKey; int nKey = 0; /* Key */
  4868. u8 *aVal = 0; int nVal = 0; /* Value */
  4869. int iTopic;
  4870. u8 *aCell;
  4871. i64 iPgPtr;
  4872. int eType;
  4873. aCell = pageGetCell(aData, nData, i);
  4874. eType = *aCell++;
  4875. assert( (flags & SEGMENT_BTREE_FLAG) || eType!=0 );
  4876. aCell += lsmVarintGet64(aCell, &iPgPtr);
  4877. if( eType==0 ){
  4878. LsmPgno iRef; /* Page number of referenced page */
  4879. aCell += lsmVarintGet64(aCell, &iRef);
  4880. lsmFsDbPageGet(pDb->pFS, pRun, iRef, &pRef);
  4881. aKey = pageGetKey(pRun, pRef, 0, &iTopic, &nKey, &blob);
  4882. }else{
  4883. aCell += lsmVarintGet32(aCell, &nKey);
  4884. if( rtIsWrite(eType) ) aCell += lsmVarintGet32(aCell, &nVal);
  4885. sortedReadData(0, pPg, (aCell-aData), nKey+nVal, (void **)&aKey, &blob);
  4886. aVal = &aKey[nKey];
  4887. iTopic = eType;
  4888. }
  4889. lsmStringAppendf(&s, "%s%2X:", (i==0?"":" "), iTopic);
  4890. for(iChar=0; iChar<nKey; iChar++){
  4891. lsmStringAppendf(&s, "%c", isalnum(aKey[iChar]) ? aKey[iChar] : '.');
  4892. }
  4893. if( nVal>0 && bVals ){
  4894. lsmStringAppendf(&s, "##");
  4895. for(iChar=0; iChar<nVal; iChar++){
  4896. lsmStringAppendf(&s, "%c", isalnum(aVal[iChar]) ? aVal[iChar] : '.');
  4897. }
  4898. }
  4899. lsmStringAppendf(&s, " %lld", iPgPtr+iPtr);
  4900. lsmFsPageRelease(pRef);
  4901. }
  4902. lsmStringAppend(&s, "}", 1);
  4903. lsmLogMessage(pDb, LSM_OK, " Page %d: %s", lsmFsPageNumber(pPg), s.z);
  4904. lsmStringClear(&s);
  4905. sortedBlobFree(&blob);
  4906. }
  4907. static void infoCellDump(
  4908. lsm_db *pDb, /* Database handle */
  4909. Segment *pSeg, /* Segment page belongs to */
  4910. int bIndirect, /* True to follow indirect refs */
  4911. Page *pPg,
  4912. int iCell,
  4913. int *peType,
  4914. int *piPgPtr,
  4915. u8 **paKey, int *pnKey,
  4916. u8 **paVal, int *pnVal,
  4917. LsmBlob *pBlob
  4918. ){
  4919. u8 *aData; int nData; /* Page data */
  4920. u8 *aKey; int nKey = 0; /* Key */
  4921. u8 *aVal = 0; int nVal = 0; /* Value */
  4922. int eType;
  4923. int iPgPtr;
  4924. Page *pRef = 0; /* Pointer to page iRef */
  4925. u8 *aCell;
  4926. aData = fsPageData(pPg, &nData);
  4927. aCell = pageGetCell(aData, nData, iCell);
  4928. eType = *aCell++;
  4929. aCell += lsmVarintGet32(aCell, &iPgPtr);
  4930. if( eType==0 ){
  4931. int dummy;
  4932. LsmPgno iRef; /* Page number of referenced page */
  4933. aCell += lsmVarintGet64(aCell, &iRef);
  4934. if( bIndirect ){
  4935. lsmFsDbPageGet(pDb->pFS, pSeg, iRef, &pRef);
  4936. pageGetKeyCopy(pDb->pEnv, pSeg, pRef, 0, &dummy, pBlob);
  4937. aKey = (u8 *)pBlob->pData;
  4938. nKey = pBlob->nData;
  4939. lsmFsPageRelease(pRef);
  4940. }else{
  4941. aKey = (u8 *)"<indirect>";
  4942. nKey = 11;
  4943. }
  4944. }else{
  4945. aCell += lsmVarintGet32(aCell, &nKey);
  4946. if( rtIsWrite(eType) ) aCell += lsmVarintGet32(aCell, &nVal);
  4947. sortedReadData(pSeg, pPg, (aCell-aData), nKey+nVal, (void **)&aKey, pBlob);
  4948. aVal = &aKey[nKey];
  4949. }
  4950. if( peType ) *peType = eType;
  4951. if( piPgPtr ) *piPgPtr = iPgPtr;
  4952. if( paKey ) *paKey = aKey;
  4953. if( paVal ) *paVal = aVal;
  4954. if( pnKey ) *pnKey = nKey;
  4955. if( pnVal ) *pnVal = nVal;
  4956. }
  4957. static int infoAppendBlob(LsmString *pStr, int bHex, u8 *z, int n){
  4958. int iChar;
  4959. for(iChar=0; iChar<n; iChar++){
  4960. if( bHex ){
  4961. lsmStringAppendf(pStr, "%02X", z[iChar]);
  4962. }else{
  4963. lsmStringAppendf(pStr, "%c", isalnum(z[iChar]) ?z[iChar] : '.');
  4964. }
  4965. }
  4966. return LSM_OK;
  4967. }
  4968. #define INFO_PAGE_DUMP_DATA 0x01
  4969. #define INFO_PAGE_DUMP_VALUES 0x02
  4970. #define INFO_PAGE_DUMP_HEX 0x04
  4971. #define INFO_PAGE_DUMP_INDIRECT 0x08
  4972. static int infoPageDump(
  4973. lsm_db *pDb, /* Database handle */
  4974. LsmPgno iPg, /* Page number of page to dump */
  4975. int flags,
  4976. char **pzOut /* OUT: lsmMalloc'd string */
  4977. ){
  4978. int rc = LSM_OK; /* Return code */
  4979. Page *pPg = 0; /* Handle for page iPg */
  4980. int i, j; /* Loop counters */
  4981. const int perLine = 16; /* Bytes per line in the raw hex dump */
  4982. Segment *pSeg = 0;
  4983. Snapshot *pSnap;
  4984. int bValues = (flags & INFO_PAGE_DUMP_VALUES);
  4985. int bHex = (flags & INFO_PAGE_DUMP_HEX);
  4986. int bData = (flags & INFO_PAGE_DUMP_DATA);
  4987. int bIndirect = (flags & INFO_PAGE_DUMP_INDIRECT);
  4988. *pzOut = 0;
  4989. if( iPg==0 ) return LSM_ERROR;
  4990. assert( pDb->pClient || pDb->pWorker );
  4991. pSnap = pDb->pClient;
  4992. if( pSnap==0 ) pSnap = pDb->pWorker;
  4993. if( pSnap->redirect.n>0 ){
  4994. Level *pLvl;
  4995. int bUse = 0;
  4996. for(pLvl=pSnap->pLevel; pLvl->pNext; pLvl=pLvl->pNext);
  4997. pSeg = (pLvl->nRight==0 ? &pLvl->lhs : &pLvl->aRhs[pLvl->nRight-1]);
  4998. rc = lsmFsSegmentContainsPg(pDb->pFS, pSeg, iPg, &bUse);
  4999. if( bUse==0 ){
  5000. pSeg = 0;
  5001. }
  5002. }
  5003. /* iPg is a real page number (not subject to redirection). So it is safe
  5004. ** to pass a NULL in place of the segment pointer as the second argument
  5005. ** to lsmFsDbPageGet() here. */
  5006. if( rc==LSM_OK ){
  5007. rc = lsmFsDbPageGet(pDb->pFS, 0, iPg, &pPg);
  5008. }
  5009. if( rc==LSM_OK ){
  5010. LsmBlob blob = {0, 0, 0, 0};
  5011. int nKeyWidth = 0;
  5012. LsmString str;
  5013. int nRec;
  5014. LsmPgno iPtr;
  5015. int flags2;
  5016. int iCell;
  5017. u8 *aData; int nData; /* Page data and size thereof */
  5018. aData = fsPageData(pPg, &nData);
  5019. nRec = pageGetNRec(aData, nData);
  5020. iPtr = pageGetPtr(aData, nData);
  5021. flags2 = pageGetFlags(aData, nData);
  5022. lsmStringInit(&str, pDb->pEnv);
  5023. lsmStringAppendf(&str, "Page : %lld (%d bytes)\n", iPg, nData);
  5024. lsmStringAppendf(&str, "nRec : %d\n", nRec);
  5025. lsmStringAppendf(&str, "iPtr : %lld\n", iPtr);
  5026. lsmStringAppendf(&str, "flags: %04x\n", flags2);
  5027. lsmStringAppendf(&str, "\n");
  5028. for(iCell=0; iCell<nRec; iCell++){
  5029. int nKey;
  5030. infoCellDump(
  5031. pDb, pSeg, bIndirect, pPg, iCell, 0, 0, 0, &nKey, 0, 0, &blob
  5032. );
  5033. if( nKey>nKeyWidth ) nKeyWidth = nKey;
  5034. }
  5035. if( bHex ) nKeyWidth = nKeyWidth * 2;
  5036. for(iCell=0; iCell<nRec; iCell++){
  5037. u8 *aKey; int nKey = 0; /* Key */
  5038. u8 *aVal; int nVal = 0; /* Value */
  5039. int iPgPtr;
  5040. int eType;
  5041. LsmPgno iAbsPtr;
  5042. char zFlags[8];
  5043. infoCellDump(pDb, pSeg, bIndirect, pPg, iCell, &eType, &iPgPtr,
  5044. &aKey, &nKey, &aVal, &nVal, &blob
  5045. );
  5046. iAbsPtr = iPgPtr + ((flags2 & SEGMENT_BTREE_FLAG) ? 0 : iPtr);
  5047. lsmFlagsToString(eType, zFlags);
  5048. lsmStringAppendf(&str, "%s %d (%s) ",
  5049. zFlags, iAbsPtr, (rtTopic(eType) ? "sys" : "usr")
  5050. );
  5051. infoAppendBlob(&str, bHex, aKey, nKey);
  5052. if( nVal>0 && bValues ){
  5053. lsmStringAppendf(&str, "%*s", nKeyWidth - (nKey*(1+bHex)), "");
  5054. lsmStringAppendf(&str, " ");
  5055. infoAppendBlob(&str, bHex, aVal, nVal);
  5056. }
  5057. if( rtTopic(eType) ){
  5058. int iBlk = (int)~lsmGetU32(aKey);
  5059. lsmStringAppendf(&str, " (block=%d", iBlk);
  5060. if( nVal>0 ){
  5061. i64 iSnap = lsmGetU64(aVal);
  5062. lsmStringAppendf(&str, " snapshot=%lld", iSnap);
  5063. }
  5064. lsmStringAppendf(&str, ")");
  5065. }
  5066. lsmStringAppendf(&str, "\n");
  5067. }
  5068. if( bData ){
  5069. lsmStringAppendf(&str, "\n-------------------"
  5070. "-------------------------------------------------------------\n");
  5071. lsmStringAppendf(&str, "Page %d\n",
  5072. iPg, (iPg-1)*nData, iPg*nData - 1);
  5073. for(i=0; i<nData; i += perLine){
  5074. lsmStringAppendf(&str, "%04x: ", i);
  5075. for(j=0; j<perLine; j++){
  5076. if( i+j>nData ){
  5077. lsmStringAppendf(&str, " ");
  5078. }else{
  5079. lsmStringAppendf(&str, "%02x ", aData[i+j]);
  5080. }
  5081. }
  5082. lsmStringAppendf(&str, " ");
  5083. for(j=0; j<perLine; j++){
  5084. if( i+j>nData ){
  5085. lsmStringAppendf(&str, " ");
  5086. }else{
  5087. lsmStringAppendf(&str,"%c", isprint(aData[i+j]) ? aData[i+j] : '.');
  5088. }
  5089. }
  5090. lsmStringAppendf(&str,"\n");
  5091. }
  5092. }
  5093. *pzOut = str.z;
  5094. sortedBlobFree(&blob);
  5095. lsmFsPageRelease(pPg);
  5096. }
  5097. return rc;
  5098. }
  5099. int lsmInfoPageDump(
  5100. lsm_db *pDb, /* Database handle */
  5101. LsmPgno iPg, /* Page number of page to dump */
  5102. int bHex, /* True to output key/value in hex form */
  5103. char **pzOut /* OUT: lsmMalloc'd string */
  5104. ){
  5105. int flags = INFO_PAGE_DUMP_DATA | INFO_PAGE_DUMP_VALUES;
  5106. if( bHex ) flags |= INFO_PAGE_DUMP_HEX;
  5107. return infoPageDump(pDb, iPg, flags, pzOut);
  5108. }
  5109. void sortedDumpSegment(lsm_db *pDb, Segment *pRun, int bVals){
  5110. assert( pDb->xLog );
  5111. if( pRun && pRun->iFirst ){
  5112. int flags = (bVals ? INFO_PAGE_DUMP_VALUES : 0);
  5113. char *zSeg;
  5114. Page *pPg;
  5115. zSeg = segToString(pDb->pEnv, pRun, 0);
  5116. lsmLogMessage(pDb, LSM_OK, "Segment: %s", zSeg);
  5117. lsmFree(pDb->pEnv, zSeg);
  5118. lsmFsDbPageGet(pDb->pFS, pRun, pRun->iFirst, &pPg);
  5119. while( pPg ){
  5120. Page *pNext;
  5121. char *z = 0;
  5122. infoPageDump(pDb, lsmFsPageNumber(pPg), flags, &z);
  5123. lsmLogMessage(pDb, LSM_OK, "%s", z);
  5124. lsmFree(pDb->pEnv, z);
  5125. #if 0
  5126. sortedDumpPage(pDb, pRun, pPg, bVals);
  5127. #endif
  5128. lsmFsDbPageNext(pRun, pPg, 1, &pNext);
  5129. lsmFsPageRelease(pPg);
  5130. pPg = pNext;
  5131. }
  5132. }
  5133. }
  5134. /*
  5135. ** Invoke the log callback zero or more times with messages that describe
  5136. ** the current database structure.
  5137. */
  5138. void lsmSortedDumpStructure(
  5139. lsm_db *pDb, /* Database handle (used for xLog callback) */
  5140. Snapshot *pSnap, /* Snapshot to dump */
  5141. int bKeys, /* Output the keys from each segment */
  5142. int bVals, /* Output the values from each segment */
  5143. const char *zWhy /* Caption to print near top of dump */
  5144. ){
  5145. Snapshot *pDump = pSnap;
  5146. Level *pTopLevel;
  5147. char *zFree = 0;
  5148. assert( pSnap );
  5149. pTopLevel = lsmDbSnapshotLevel(pDump);
  5150. if( pDb->xLog && pTopLevel ){
  5151. static int nCall = 0;
  5152. Level *pLevel;
  5153. int iLevel = 0;
  5154. nCall++;
  5155. lsmLogMessage(pDb, LSM_OK, "Database structure %d (%s)", nCall, zWhy);
  5156. #if 0
  5157. if( nCall==1031 || nCall==1032 ) bKeys=1;
  5158. #endif
  5159. for(pLevel=pTopLevel; pLevel; pLevel=pLevel->pNext){
  5160. char zLeft[1024];
  5161. char zRight[1024];
  5162. int i = 0;
  5163. Segment *aLeft[24];
  5164. Segment *aRight[24];
  5165. int nLeft = 0;
  5166. int nRight = 0;
  5167. Segment *pSeg = &pLevel->lhs;
  5168. aLeft[nLeft++] = pSeg;
  5169. for(i=0; i<pLevel->nRight; i++){
  5170. aRight[nRight++] = &pLevel->aRhs[i];
  5171. }
  5172. #ifdef LSM_LOG_FREELIST
  5173. if( nRight ){
  5174. memmove(&aRight[1], aRight, sizeof(aRight[0])*nRight);
  5175. aRight[0] = 0;
  5176. nRight++;
  5177. }
  5178. #endif
  5179. for(i=0; i<nLeft || i<nRight; i++){
  5180. int iPad = 0;
  5181. char zLevel[32];
  5182. zLeft[0] = '\0';
  5183. zRight[0] = '\0';
  5184. if( i<nLeft ){
  5185. fileToString(pDb, zLeft, sizeof(zLeft), 24, aLeft[i]);
  5186. }
  5187. if( i<nRight ){
  5188. fileToString(pDb, zRight, sizeof(zRight), 24, aRight[i]);
  5189. }
  5190. if( i==0 ){
  5191. snprintf(zLevel, sizeof(zLevel), "L%d: (age=%d) (flags=%.4x)",
  5192. iLevel, (int)pLevel->iAge, (int)pLevel->flags
  5193. );
  5194. }else{
  5195. zLevel[0] = '\0';
  5196. }
  5197. if( nRight==0 ){
  5198. iPad = 10;
  5199. }
  5200. lsmLogMessage(pDb, LSM_OK, "% 25s % *s% -35s %s",
  5201. zLevel, iPad, "", zLeft, zRight
  5202. );
  5203. }
  5204. iLevel++;
  5205. }
  5206. if( bKeys ){
  5207. for(pLevel=pTopLevel; pLevel; pLevel=pLevel->pNext){
  5208. int i;
  5209. sortedDumpSegment(pDb, &pLevel->lhs, bVals);
  5210. for(i=0; i<pLevel->nRight; i++){
  5211. sortedDumpSegment(pDb, &pLevel->aRhs[i], bVals);
  5212. }
  5213. }
  5214. }
  5215. }
  5216. lsmInfoFreelist(pDb, &zFree);
  5217. lsmLogMessage(pDb, LSM_OK, "Freelist: %s", zFree);
  5218. lsmFree(pDb->pEnv, zFree);
  5219. assert( lsmFsIntegrityCheck(pDb) );
  5220. }
  5221. void lsmSortedFreeLevel(lsm_env *pEnv, Level *pLevel){
  5222. Level *pNext;
  5223. Level *p;
  5224. for(p=pLevel; p; p=pNext){
  5225. pNext = p->pNext;
  5226. sortedFreeLevel(pEnv, p);
  5227. }
  5228. }
  5229. void lsmSortedSaveTreeCursors(lsm_db *pDb){
  5230. MultiCursor *pCsr;
  5231. for(pCsr=pDb->pCsr; pCsr; pCsr=pCsr->pNext){
  5232. lsmTreeCursorSave(pCsr->apTreeCsr[0]);
  5233. lsmTreeCursorSave(pCsr->apTreeCsr[1]);
  5234. }
  5235. }
  5236. void lsmSortedExpandBtreePage(Page *pPg, int nOrig){
  5237. u8 *aData;
  5238. int nData;
  5239. int nEntry;
  5240. int iHdr;
  5241. aData = lsmFsPageData(pPg, &nData);
  5242. nEntry = pageGetNRec(aData, nOrig);
  5243. iHdr = SEGMENT_EOF(nOrig, nEntry);
  5244. memmove(&aData[iHdr + (nData-nOrig)], &aData[iHdr], nOrig-iHdr);
  5245. }
  5246. #ifdef LSM_DEBUG_EXPENSIVE
  5247. static void assertRunInOrder(lsm_db *pDb, Segment *pSeg){
  5248. Page *pPg = 0;
  5249. LsmBlob blob1 = {0, 0, 0, 0};
  5250. LsmBlob blob2 = {0, 0, 0, 0};
  5251. lsmFsDbPageGet(pDb->pFS, pSeg, pSeg->iFirst, &pPg);
  5252. while( pPg ){
  5253. u8 *aData; int nData;
  5254. Page *pNext;
  5255. aData = lsmFsPageData(pPg, &nData);
  5256. if( 0==(pageGetFlags(aData, nData) & SEGMENT_BTREE_FLAG) ){
  5257. int i;
  5258. int nRec = pageGetNRec(aData, nData);
  5259. for(i=0; i<nRec; i++){
  5260. int iTopic1, iTopic2;
  5261. pageGetKeyCopy(pDb->pEnv, pSeg, pPg, i, &iTopic1, &blob1);
  5262. if( i==0 && blob2.nData ){
  5263. assert( sortedKeyCompare(
  5264. pDb->xCmp, iTopic2, blob2.pData, blob2.nData,
  5265. iTopic1, blob1.pData, blob1.nData
  5266. )<0 );
  5267. }
  5268. if( i<(nRec-1) ){
  5269. pageGetKeyCopy(pDb->pEnv, pSeg, pPg, i+1, &iTopic2, &blob2);
  5270. assert( sortedKeyCompare(
  5271. pDb->xCmp, iTopic1, blob1.pData, blob1.nData,
  5272. iTopic2, blob2.pData, blob2.nData
  5273. )<0 );
  5274. }
  5275. }
  5276. }
  5277. lsmFsDbPageNext(pSeg, pPg, 1, &pNext);
  5278. lsmFsPageRelease(pPg);
  5279. pPg = pNext;
  5280. }
  5281. sortedBlobFree(&blob1);
  5282. sortedBlobFree(&blob2);
  5283. }
  5284. #endif
  5285. #ifdef LSM_DEBUG_EXPENSIVE
  5286. /*
  5287. ** This function is only included in the build if LSM_DEBUG_EXPENSIVE is
  5288. ** defined. Its only purpose is to evaluate various assert() statements to
  5289. ** verify that the database is well formed in certain respects.
  5290. **
  5291. ** More specifically, it checks that the array pOne contains the required
  5292. ** pointers to pTwo. Array pTwo must be a main array. pOne may be either a
  5293. ** separators array or another main array. If pOne does not contain the
  5294. ** correct set of pointers, an assert() statement fails.
  5295. */
  5296. static int assertPointersOk(
  5297. lsm_db *pDb, /* Database handle */
  5298. Segment *pOne, /* Segment containing pointers */
  5299. Segment *pTwo, /* Segment containing pointer targets */
  5300. int bRhs /* True if pTwo may have been Gobble()d */
  5301. ){
  5302. int rc = LSM_OK; /* Error code */
  5303. SegmentPtr ptr1; /* Iterates through pOne */
  5304. SegmentPtr ptr2; /* Iterates through pTwo */
  5305. LsmPgno iPrev;
  5306. assert( pOne && pTwo );
  5307. memset(&ptr1, 0, sizeof(ptr1));
  5308. memset(&ptr2, 0, sizeof(ptr1));
  5309. ptr1.pSeg = pOne;
  5310. ptr2.pSeg = pTwo;
  5311. segmentPtrEndPage(pDb->pFS, &ptr1, 0, &rc);
  5312. segmentPtrEndPage(pDb->pFS, &ptr2, 0, &rc);
  5313. /* Check that the footer pointer of the first page of pOne points to
  5314. ** the first page of pTwo. */
  5315. iPrev = pTwo->iFirst;
  5316. if( ptr1.iPtr!=iPrev && !bRhs ){
  5317. assert( 0 );
  5318. }
  5319. if( rc==LSM_OK && ptr1.nCell>0 ){
  5320. rc = segmentPtrLoadCell(&ptr1, 0);
  5321. }
  5322. while( rc==LSM_OK && ptr2.pPg ){
  5323. LsmPgno iThis;
  5324. /* Advance to the next page of segment pTwo that contains at least
  5325. ** one cell. Break out of the loop if the iterator reaches EOF. */
  5326. do{
  5327. rc = segmentPtrNextPage(&ptr2, 1);
  5328. assert( rc==LSM_OK );
  5329. }while( rc==LSM_OK && ptr2.pPg && ptr2.nCell==0 );
  5330. if( rc!=LSM_OK || ptr2.pPg==0 ) break;
  5331. iThis = lsmFsPageNumber(ptr2.pPg);
  5332. if( (ptr2.flags & (PGFTR_SKIP_THIS_FLAG|SEGMENT_BTREE_FLAG))==0 ){
  5333. /* Load the first cell in the array pTwo page. */
  5334. rc = segmentPtrLoadCell(&ptr2, 0);
  5335. /* Iterate forwards through pOne, searching for a key that matches the
  5336. ** key ptr2.pKey/nKey. This key should have a pointer to the page that
  5337. ** ptr2 currently points to. */
  5338. while( rc==LSM_OK ){
  5339. int res = rtTopic(ptr1.eType) - rtTopic(ptr2.eType);
  5340. if( res==0 ){
  5341. res = pDb->xCmp(ptr1.pKey, ptr1.nKey, ptr2.pKey, ptr2.nKey);
  5342. }
  5343. if( res<0 ){
  5344. assert( bRhs || ptr1.iPtr+ptr1.iPgPtr==iPrev );
  5345. }else if( res>0 ){
  5346. assert( 0 );
  5347. }else{
  5348. assert( ptr1.iPtr+ptr1.iPgPtr==iThis );
  5349. iPrev = iThis;
  5350. break;
  5351. }
  5352. rc = segmentPtrAdvance(0, &ptr1, 0);
  5353. if( ptr1.pPg==0 ){
  5354. assert( 0 );
  5355. }
  5356. }
  5357. }
  5358. }
  5359. segmentPtrReset(&ptr1, 0);
  5360. segmentPtrReset(&ptr2, 0);
  5361. return LSM_OK;
  5362. }
  5363. /*
  5364. ** This function is only included in the build if LSM_DEBUG_EXPENSIVE is
  5365. ** defined. Its only purpose is to evaluate various assert() statements to
  5366. ** verify that the database is well formed in certain respects.
  5367. **
  5368. ** More specifically, it checks that the b-tree embedded in array pRun
  5369. ** contains the correct keys. If not, an assert() fails.
  5370. */
  5371. static int assertBtreeOk(
  5372. lsm_db *pDb,
  5373. Segment *pSeg
  5374. ){
  5375. int rc = LSM_OK; /* Return code */
  5376. if( pSeg->iRoot ){
  5377. LsmBlob blob = {0, 0, 0}; /* Buffer used to cache overflow keys */
  5378. FileSystem *pFS = pDb->pFS; /* File system to read from */
  5379. Page *pPg = 0; /* Main run page */
  5380. BtreeCursor *pCsr = 0; /* Btree cursor */
  5381. rc = btreeCursorNew(pDb, pSeg, &pCsr);
  5382. if( rc==LSM_OK ){
  5383. rc = btreeCursorFirst(pCsr);
  5384. }
  5385. if( rc==LSM_OK ){
  5386. rc = lsmFsDbPageGet(pFS, pSeg, pSeg->iFirst, &pPg);
  5387. }
  5388. while( rc==LSM_OK ){
  5389. Page *pNext;
  5390. u8 *aData;
  5391. int nData;
  5392. int flags;
  5393. rc = lsmFsDbPageNext(pSeg, pPg, 1, &pNext);
  5394. lsmFsPageRelease(pPg);
  5395. pPg = pNext;
  5396. if( pPg==0 ) break;
  5397. aData = fsPageData(pPg, &nData);
  5398. flags = pageGetFlags(aData, nData);
  5399. if( rc==LSM_OK
  5400. && 0==((SEGMENT_BTREE_FLAG|PGFTR_SKIP_THIS_FLAG) & flags)
  5401. && 0!=pageGetNRec(aData, nData)
  5402. ){
  5403. u8 *pKey;
  5404. int nKey;
  5405. int iTopic;
  5406. pKey = pageGetKey(pSeg, pPg, 0, &iTopic, &nKey, &blob);
  5407. assert( nKey==pCsr->nKey && 0==memcmp(pKey, pCsr->pKey, nKey) );
  5408. assert( lsmFsPageNumber(pPg)==pCsr->iPtr );
  5409. rc = btreeCursorNext(pCsr);
  5410. }
  5411. }
  5412. assert( rc!=LSM_OK || pCsr->pKey==0 );
  5413. if( pPg ) lsmFsPageRelease(pPg);
  5414. btreeCursorFree(pCsr);
  5415. sortedBlobFree(&blob);
  5416. }
  5417. return rc;
  5418. }
  5419. #endif /* ifdef LSM_DEBUG_EXPENSIVE */