/* fib_trie.c — scraped copy; original page's line-number gutter removed. */
  1. /*
  2. * This program is free software; you can redistribute it and/or
  3. * modify it under the terms of the GNU General Public License
  4. * as published by the Free Software Foundation; either version
  5. * 2 of the License, or (at your option) any later version.
  6. *
  7. * Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
  8. * & Swedish University of Agricultural Sciences.
  9. *
  10. * Jens Laas <jens.laas@data.slu.se> Swedish University of
  11. * Agricultural Sciences.
  12. *
  13. * Hans Liss <hans.liss@its.uu.se> Uppsala Universitet
  14. *
  15. * This work is based on the LPC-trie which is originally described in:
  16. *
  17. * An experimental study of compression methods for dynamic tries
  18. * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
  19. * http://www.csc.kth.se/~snilsson/software/dyntrie2/
  20. *
  21. *
  22. * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
  23. * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
  24. *
  25. *
  26. * Code from fib_hash has been reused which includes the following header:
  27. *
  28. *
  29. * INET An implementation of the TCP/IP protocol suite for the LINUX
  30. * operating system. INET is implemented using the BSD Socket
  31. * interface as the means of communication with the user level.
  32. *
  33. * IPv4 FIB: lookup engine and maintenance routines.
  34. *
  35. *
  36. * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  37. *
  38. * This program is free software; you can redistribute it and/or
  39. * modify it under the terms of the GNU General Public License
  40. * as published by the Free Software Foundation; either version
  41. * 2 of the License, or (at your option) any later version.
  42. *
  43. * Substantial contributions to this work comes from:
  44. *
  45. * David S. Miller, <davem@davemloft.net>
  46. * Stephen Hemminger <shemminger@osdl.org>
  47. * Paul E. McKenney <paulmck@us.ibm.com>
  48. * Patrick McHardy <kaber@trash.net>
  49. */
  50. #define VERSION "0.409"
  51. #include <asm/uaccess.h>
  52. #include <linux/bitops.h>
  53. #include <linux/types.h>
  54. #include <linux/kernel.h>
  55. #include <linux/mm.h>
  56. #include <linux/string.h>
  57. #include <linux/socket.h>
  58. #include <linux/sockios.h>
  59. #include <linux/errno.h>
  60. #include <linux/in.h>
  61. #include <linux/inet.h>
  62. #include <linux/inetdevice.h>
  63. #include <linux/netdevice.h>
  64. #include <linux/if_arp.h>
  65. #include <linux/proc_fs.h>
  66. #include <linux/rcupdate.h>
  67. #include <linux/skbuff.h>
  68. #include <linux/netlink.h>
  69. #include <linux/init.h>
  70. #include <linux/list.h>
  71. #include <linux/slab.h>
  72. #include <linux/export.h>
  73. #include <net/net_namespace.h>
  74. #include <net/ip.h>
  75. #include <net/protocol.h>
  76. #include <net/route.h>
  77. #include <net/tcp.h>
  78. #include <net/sock.h>
  79. #include <net/ip_fib.h>
  80. #include "fib_lookup.h"
#define MAX_STAT_DEPTH 32

/* All trie keys are 32-bit (IPv4 addresses); KEYLENGTH is the key width in bits. */
#define KEYLENGTH (8*sizeof(t_key))
typedef unsigned int t_key;

/* The low bit of ->parent encodes the node type: leaf vs internal tnode. */
#define T_TNODE 0
#define T_LEAF 1
#define NODE_TYPE_MASK 0x1UL
#define NODE_TYPE(node) ((node)->parent & NODE_TYPE_MASK)

#define IS_TNODE(n) (!(n->parent & T_LEAF))
#define IS_LEAF(n) (n->parent & T_LEAF)

/* Common header shared by leaf and tnode: tagged parent word plus key. */
struct rt_trie_node {
	unsigned long parent;
	t_key key;
};

/* External node: anchors the per-prefix-length leaf_info entries for one key. */
struct leaf {
	unsigned long parent;
	t_key key;
	struct hlist_head list;
	struct rcu_head rcu;
};

/* Per-prefix-length info hanging off a leaf's list. */
struct leaf_info {
	struct hlist_node hlist;
	int plen;
	u32 mask_plen; /* ntohl(inet_make_mask(plen)) */
	struct list_head falh;
	struct rcu_head rcu;
};

/* Internal node: 2^bits child slots indexing key bits [pos, pos + bits). */
struct tnode {
	unsigned long parent;
	t_key key;
	unsigned char pos;		/* 2log(KEYLENGTH) bits needed */
	unsigned char bits;		/* 2log(KEYLENGTH) bits needed */
	unsigned int full_children;	/* KEYLENGTH bits needed */
	unsigned int empty_children;	/* KEYLENGTH bits needed */
	union {
		struct rcu_head rcu;		/* RCU-deferred free */
		struct work_struct work;	/* deferred vfree */
		struct tnode *tnode_free;	/* resize() free-list link */
	};
	struct rt_trie_node __rcu *child[0];	/* flexible child array */
};

#ifdef CONFIG_IP_FIB_TRIE_STATS
/* Lookup-path counters, bumped during fib lookups. */
struct trie_use_stats {
	unsigned int gets;
	unsigned int backtrack;
	unsigned int semantic_match_passed;
	unsigned int semantic_match_miss;
	unsigned int null_node_hit;
	unsigned int resize_node_skipped;
};
#endif

/* Snapshot of the trie's shape, filled by a full walk (for /proc output). */
struct trie_stat {
	unsigned int totdepth;
	unsigned int maxdepth;
	unsigned int tnodes;
	unsigned int leaves;
	unsigned int nullpointers;
	unsigned int prefixes;
	unsigned int nodesizes[MAX_STAT_DEPTH];
};

struct trie {
	struct rt_trie_node __rcu *trie;	/* root node */
#ifdef CONFIG_IP_FIB_TRIE_STATS
	struct trie_use_stats stats;
#endif
};
/* Forward declarations for the trie maintenance helpers below. */
static void put_child(struct trie *t, struct tnode *tn, int i, struct rt_trie_node *n);
static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *n,
				  int wasfull);
static struct rt_trie_node *resize(struct trie *t, struct tnode *tn);
static struct tnode *inflate(struct trie *t, struct tnode *tn);
static struct tnode *halve(struct trie *t, struct tnode *tn);

/* tnodes to free after resize(); protected by RTNL */
static struct tnode *tnode_free_head;
static size_t tnode_free_size;

/*
 * synchronize_rcu after call_rcu for that many pages; it should be especially
 * useful before resizing the root node with PREEMPT_NONE configs; the value was
 * obtained experimentally, aiming to avoid visible slowdown.
 */
static const int sync_pages = 128;

static struct kmem_cache *fn_alias_kmem __read_mostly;
static struct kmem_cache *trie_leaf_kmem __read_mostly;
/*
 * caller must hold RTNL
 *
 * Return the parent tnode of @node with the leaf/tnode type tag stripped
 * from the low bit of ->parent.  NULL for the root node.
 */
static inline struct tnode *node_parent(const struct rt_trie_node *node)
{
	unsigned long parent;

	parent = rcu_dereference_index_check(node->parent, lockdep_rtnl_is_held());

	return (struct tnode *)(parent & ~NODE_TYPE_MASK);
}
/*
 * caller must hold RCU read lock or RTNL
 *
 * Same as node_parent(), but legal from an RCU read-side critical
 * section as well as under RTNL.
 */
static inline struct tnode *node_parent_rcu(const struct rt_trie_node *node)
{
	unsigned long parent;

	parent = rcu_dereference_index_check(node->parent, rcu_read_lock_held() ||
							   lockdep_rtnl_is_held());

	return (struct tnode *)(parent & ~NODE_TYPE_MASK);
}
/* Same as rcu_assign_pointer
 * but that macro() assumes that value is a pointer.
 *
 * The smp_wmb() orders the node's initialization before the new parent
 * value becomes visible; the type tag in the low bit is preserved.
 */
static inline void node_set_parent(struct rt_trie_node *node, struct tnode *ptr)
{
	smp_wmb();
	node->parent = (unsigned long)ptr | NODE_TYPE(node);
}
/*
 * caller must hold RTNL
 *
 * Fetch child slot @i; the BUG_ON guards against an index outside the
 * node's 2^bits child array.
 */
static inline struct rt_trie_node *tnode_get_child(const struct tnode *tn, unsigned int i)
{
	BUG_ON(i >= 1U << tn->bits);

	return rtnl_dereference(tn->child[i]);
}
/*
 * caller must hold RCU read lock or RTNL
 *
 * RCU-safe variant of tnode_get_child().
 */
static inline struct rt_trie_node *tnode_get_child_rcu(const struct tnode *tn, unsigned int i)
{
	BUG_ON(i >= 1U << tn->bits);

	return rcu_dereference_rtnl(tn->child[i]);
}
  206. static inline int tnode_child_length(const struct tnode *tn)
  207. {
  208. return 1 << tn->bits;
  209. }
  210. static inline t_key mask_pfx(t_key k, unsigned int l)
  211. {
  212. return (l == 0) ? 0 : k >> (KEYLENGTH-l) << (KEYLENGTH-l);
  213. }
/*
 * Return @bits bits of @a starting at bit @offset, counting from the MSB.
 * Offsets at or beyond the key width yield 0.
 *
 * NOTE(review): assumes bits >= 1; for bits == 0 the right shift would be
 * by KEYLENGTH (undefined behavior) — callers pass tn->bits >= 1.
 */
static inline t_key tkey_extract_bits(t_key a, unsigned int offset, unsigned int bits)
{
	if (offset < KEYLENGTH)
		return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
	else
		return 0;
}
  221. static inline int tkey_equals(t_key a, t_key b)
  222. {
  223. return a == b;
  224. }
/*
 * Compare bits [offset, offset + bits) of @a and @b, counting from the
 * MSB.  A zero-length or entirely out-of-range window trivially matches.
 */
static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
{
	if (bits == 0 || offset >= KEYLENGTH)
		return 1;
	bits = bits > KEYLENGTH ? KEYLENGTH : bits;	/* clamp the window */
	/* shift off the bits before the window, then keep its top @bits bits */
	return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
}
/*
 * Return the index (from the MSB) of the first bit at which @a and @b
 * differ, or 0 when the keys are identical.
 *
 * NOTE(review): assumes the first differing bit is at or after @offset;
 * otherwise the scan loop would shift the difference out and not
 * terminate — callers guarantee the prefix before @offset matches.
 */
static inline int tkey_mismatch(t_key a, int offset, t_key b)
{
	t_key diff = a ^ b;
	int i = offset;

	if (!diff)
		return 0;
	/* advance until the differing bit lands in the MSB position */
	while ((diff << i) >> (KEYLENGTH-1) == 0)
		i++;
	return i;
}
  242. /*
  243. To understand this stuff, an understanding of keys and all their bits is
  244. necessary. Every node in the trie has a key associated with it, but not
  245. all of the bits in that key are significant.
  246. Consider a node 'n' and its parent 'tp'.
  247. If n is a leaf, every bit in its key is significant. Its presence is
  248. necessitated by path compression, since during a tree traversal (when
  249. searching for a leaf - unless we are doing an insertion) we will completely
  250. ignore all skipped bits we encounter. Thus we need to verify, at the end of
  251. a potentially successful search, that we have indeed been walking the
  252. correct key path.
  253. Note that we can never "miss" the correct key in the tree if present by
  254. following the wrong path. Path compression ensures that segments of the key
  255. that are the same for all keys with a given prefix are skipped, but the
  256. skipped part *is* identical for each node in the subtrie below the skipped
  257. bit! trie_insert() in this implementation takes care of that - note the
  258. call to tkey_sub_equals() in trie_insert().
  259. if n is an internal node - a 'tnode' here, the various parts of its key
  260. have many different meanings.
  261. Example:
  262. _________________________________________________________________
  263. | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
  264. -----------------------------------------------------------------
  265. 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
  266. _________________________________________________________________
  267. | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
  268. -----------------------------------------------------------------
  269. 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
  270. tp->pos = 7
  271. tp->bits = 3
  272. n->pos = 15
  273. n->bits = 4
  274. First, let's just ignore the bits that come before the parent tp, that is
  275. the bits from 0 to (tp->pos-1). They are *known* but at this point we do
  276. not use them for anything.
  277. The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
  278. index into the parent's child array. That is, they will be used to find
  279. 'n' among tp's children.
  280. The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
  281. for the node n.
  282. All the bits we have seen so far are significant to the node n. The rest
  283. of the bits are really not needed or indeed known in n->key.
  284. The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
  285. n's child array, and will of course be different for each child.
  286. The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
  287. at this point.
  288. */
/* Sanity check: pos + bits may never address bits beyond the 32-bit key. */
static inline void check_tnode(const struct tnode *tn)
{
	WARN_ON(tn && tn->pos+tn->bits > 32);
}

/* Resize thresholds, in percent; the root node is kept larger (it is
 * inflated more eagerly and halved more reluctantly than inner nodes). */
static const int halve_threshold = 25;
static const int inflate_threshold = 50;
static const int halve_threshold_root = 15;
static const int inflate_threshold_root = 30;
  297. static void __alias_free_mem(struct rcu_head *head)
  298. {
  299. struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
  300. kmem_cache_free(fn_alias_kmem, fa);
  301. }
/* Free @fa after an RCU grace period (lookups may still reference it). */
static inline void alias_free_mem_rcu(struct fib_alias *fa)
{
	call_rcu(&fa->rcu, __alias_free_mem);
}
  306. static void __leaf_free_rcu(struct rcu_head *head)
  307. {
  308. struct leaf *l = container_of(head, struct leaf, rcu);
  309. kmem_cache_free(trie_leaf_kmem, l);
  310. }
/* Free @l after an RCU grace period (readers may still hold it). */
static inline void free_leaf(struct leaf *l)
{
	call_rcu(&l->rcu, __leaf_free_rcu);
}
/* RCU-deferred kfree of a leaf_info. */
static inline void free_leaf_info(struct leaf_info *leaf)
{
	kfree_rcu(leaf, rcu);
}
  319. static struct tnode *tnode_alloc(size_t size)
  320. {
  321. if (size <= PAGE_SIZE)
  322. return kzalloc(size, GFP_KERNEL);
  323. else
  324. return vzalloc(size);
  325. }
/* Workqueue callback: vfree must run in process context. */
static void __tnode_vfree(struct work_struct *arg)
{
	struct tnode *tn = container_of(arg, struct tnode, work);
	vfree(tn);
}
  331. static void __tnode_free_rcu(struct rcu_head *head)
  332. {
  333. struct tnode *tn = container_of(head, struct tnode, rcu);
  334. size_t size = sizeof(struct tnode) +
  335. (sizeof(struct rt_trie_node *) << tn->bits);
  336. if (size <= PAGE_SIZE)
  337. kfree(tn);
  338. else {
  339. INIT_WORK(&tn->work, __tnode_vfree);
  340. schedule_work(&tn->work);
  341. }
  342. }
  343. static inline void tnode_free(struct tnode *tn)
  344. {
  345. if (IS_LEAF(tn))
  346. free_leaf((struct leaf *) tn);
  347. else
  348. call_rcu(&tn->rcu, __tnode_free_rcu);
  349. }
/*
 * Defer freeing of an internal node until tnode_free_flush(): chain it
 * on the RTNL-protected free list so resize() can batch the frees and
 * track how much memory is waiting for a grace period.
 */
static void tnode_free_safe(struct tnode *tn)
{
	BUG_ON(IS_LEAF(tn));
	tn->tnode_free = tnode_free_head;
	tnode_free_head = tn;
	tnode_free_size += sizeof(struct tnode) +
			   (sizeof(struct rt_trie_node *) << tn->bits);
}
/*
 * Release every node queued by tnode_free_safe().  When a large amount
 * of memory is pending (see sync_pages), force a grace period so the
 * RCU callbacks actually run before yet more memory is queued.
 */
static void tnode_free_flush(void)
{
	struct tnode *tn;

	while ((tn = tnode_free_head)) {
		tnode_free_head = tn->tnode_free;
		tn->tnode_free = NULL;
		tnode_free(tn);
	}

	if (tnode_free_size >= PAGE_SIZE * sync_pages) {
		tnode_free_size = 0;
		synchronize_rcu();
	}
}
  371. static struct leaf *leaf_new(void)
  372. {
  373. struct leaf *l = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL);
  374. if (l) {
  375. l->parent = T_LEAF;
  376. INIT_HLIST_HEAD(&l->list);
  377. }
  378. return l;
  379. }
  380. static struct leaf_info *leaf_info_new(int plen)
  381. {
  382. struct leaf_info *li = kmalloc(sizeof(struct leaf_info), GFP_KERNEL);
  383. if (li) {
  384. li->plen = plen;
  385. li->mask_plen = ntohl(inet_make_mask(plen));
  386. INIT_LIST_HEAD(&li->falh);
  387. }
  388. return li;
  389. }
  390. static struct tnode *tnode_new(t_key key, int pos, int bits)
  391. {
  392. size_t sz = sizeof(struct tnode) + (sizeof(struct rt_trie_node *) << bits);
  393. struct tnode *tn = tnode_alloc(sz);
  394. if (tn) {
  395. tn->parent = T_TNODE;
  396. tn->pos = pos;
  397. tn->bits = bits;
  398. tn->key = key;
  399. tn->full_children = 0;
  400. tn->empty_children = 1<<bits;
  401. }
  402. pr_debug("AT %p s=%zu %zu\n", tn, sizeof(struct tnode),
  403. sizeof(struct rt_trie_node) << bits);
  404. return tn;
  405. }
  406. /*
  407. * Check whether a tnode 'n' is "full", i.e. it is an internal node
  408. * and no bits are skipped. See discussion in dyntree paper p. 6
  409. */
  410. static inline int tnode_full(const struct tnode *tn, const struct rt_trie_node *n)
  411. {
  412. if (n == NULL || IS_LEAF(n))
  413. return 0;
  414. return ((struct tnode *) n)->pos == tn->pos + tn->bits;
  415. }
/*
 * Install child @n at slot @i of @tn, recomputing the old slot's
 * fullness from scratch (wasfull == -1).  @t is unused here but kept
 * for symmetry with the other trie operations.
 */
static inline void put_child(struct trie *t, struct tnode *tn, int i,
			     struct rt_trie_node *n)
{
	tnode_put_child_reorg(tn, i, n, -1);
}
/*
 * Add a child at position i overwriting the old value.
 * Update the value of full_children and empty_children.
 *
 * @wasfull == -1 means "unknown": recompute the old child's fullness.
 * The parent back-pointer is set before rcu_assign_pointer() publishes
 * the child, so a reader never sees a child lacking a valid parent.
 */
static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *n,
				  int wasfull)
{
	struct rt_trie_node *chi = rtnl_dereference(tn->child[i]);
	int isfull;

	BUG_ON(i >= 1<<tn->bits);

	/* update emptyChildren */
	if (n == NULL && chi != NULL)
		tn->empty_children++;
	else if (n != NULL && chi == NULL)
		tn->empty_children--;

	/* update fullChildren */
	if (wasfull == -1)
		wasfull = tnode_full(tn, chi);

	isfull = tnode_full(tn, n);
	if (wasfull && !isfull)
		tn->full_children--;
	else if (!wasfull && isfull)
		tn->full_children++;

	if (n)
		node_set_parent(n, tn);

	rcu_assign_pointer(tn->child[i], n);
}
#define MAX_WORK 10	/* cap on inflate/halve passes per resize() call */

/*
 * Rebalance @tn after a change to its children: drop empty nodes,
 * collapse single-child nodes, inflate dense nodes and halve sparse
 * ones.  Returns the (possibly replaced) node to install in the parent,
 * or NULL when the node vanished entirely.  Caller holds RTNL.
 */
static struct rt_trie_node *resize(struct trie *t, struct tnode *tn)
{
	int i;
	struct tnode *old_tn;
	int inflate_threshold_use;
	int halve_threshold_use;
	int max_work;

	if (!tn)
		return NULL;

	pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
		 tn, inflate_threshold, halve_threshold);

	/* No children */
	if (tn->empty_children == tnode_child_length(tn)) {
		tnode_free_safe(tn);
		return NULL;
	}
	/* One child */
	if (tn->empty_children == tnode_child_length(tn) - 1)
		goto one_child;
	/*
	 * Double as long as the resulting node has a number of
	 * nonempty nodes that are above the threshold.
	 */

	/*
	 * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
	 * the Helsinki University of Technology and Matti Tikkanen of Nokia
	 * Telecommunications, page 6:
	 * "A node is doubled if the ratio of non-empty children to all
	 * children in the *doubled* node is at least 'high'."
	 *
	 * 'high' in this instance is the variable 'inflate_threshold'. It
	 * is expressed as a percentage, so we multiply it with
	 * tnode_child_length() and instead of multiplying by 2 (since the
	 * child array will be doubled by inflate()) and multiplying
	 * the left-hand side by 100 (to handle the percentage thing) we
	 * multiply the left-hand side by 50.
	 *
	 * The left-hand side may look a bit weird: tnode_child_length(tn)
	 * - tn->empty_children is of course the number of non-null children
	 * in the current node. tn->full_children is the number of "full"
	 * children, that is non-null tnodes with a skip value of 0.
	 * All of those will be doubled in the resulting inflated tnode, so
	 * we just count them one extra time here.
	 *
	 * A clearer way to write this would be:
	 *
	 * to_be_doubled = tn->full_children;
	 * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
	 *     tn->full_children;
	 *
	 * new_child_length = tnode_child_length(tn) * 2;
	 *
	 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
	 *      new_child_length;
	 * if (new_fill_factor >= inflate_threshold)
	 *
	 * ...and so on, tho it would mess up the while () loop.
	 *
	 * anyway,
	 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
	 *      inflate_threshold
	 *
	 * avoid a division:
	 * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
	 *      inflate_threshold * new_child_length
	 *
	 * expand not_to_be_doubled and to_be_doubled, and shorten:
	 * 100 * (tnode_child_length(tn) - tn->empty_children +
	 *    tn->full_children) >= inflate_threshold * new_child_length
	 *
	 * expand new_child_length:
	 * 100 * (tnode_child_length(tn) - tn->empty_children +
	 *    tn->full_children) >=
	 *      inflate_threshold * tnode_child_length(tn) * 2
	 *
	 * shorten again:
	 * 50 * (tn->full_children + tnode_child_length(tn) -
	 *    tn->empty_children) >= inflate_threshold *
	 *    tnode_child_length(tn)
	 *
	 */

	check_tnode(tn);

	/* Keep root node larger */
	if (!node_parent((struct rt_trie_node *)tn)) {
		inflate_threshold_use = inflate_threshold_root;
		halve_threshold_use = halve_threshold_root;
	} else {
		inflate_threshold_use = inflate_threshold;
		halve_threshold_use = halve_threshold;
	}

	max_work = MAX_WORK;
	while ((tn->full_children > 0 && max_work-- &&
		50 * (tn->full_children + tnode_child_length(tn)
		      - tn->empty_children)
		>= inflate_threshold_use * tnode_child_length(tn))) {

		old_tn = tn;
		tn = inflate(t, tn);

		if (IS_ERR(tn)) {
			/* out of memory: keep the old node and stop growing */
			tn = old_tn;
#ifdef CONFIG_IP_FIB_TRIE_STATS
			t->stats.resize_node_skipped++;
#endif
			break;
		}
	}

	check_tnode(tn);

	/* Return if at least one inflate is run */
	if (max_work != MAX_WORK)
		return (struct rt_trie_node *) tn;

	/*
	 * Halve as long as the number of empty children in this
	 * node is above threshold.
	 */
	max_work = MAX_WORK;
	while (tn->bits > 1 && max_work-- &&
	       100 * (tnode_child_length(tn) - tn->empty_children) <
	       halve_threshold_use * tnode_child_length(tn)) {

		old_tn = tn;
		tn = halve(t, tn);
		if (IS_ERR(tn)) {
			/* out of memory: keep the old node and stop shrinking */
			tn = old_tn;
#ifdef CONFIG_IP_FIB_TRIE_STATS
			t->stats.resize_node_skipped++;
#endif
			break;
		}
	}

	/* Only one child remains */
	if (tn->empty_children == tnode_child_length(tn) - 1) {
one_child:
		/* find the lone child and splice it into our place */
		for (i = 0; i < tnode_child_length(tn); i++) {
			struct rt_trie_node *n;

			n = rtnl_dereference(tn->child[i]);
			if (!n)
				continue;

			/* compress one level */
			node_set_parent(n, NULL);
			tnode_free_safe(tn);
			return n;
		}
	}
	return (struct rt_trie_node *) tn;
}
  592. static void tnode_clean_free(struct tnode *tn)
  593. {
  594. int i;
  595. struct tnode *tofree;
  596. for (i = 0; i < tnode_child_length(tn); i++) {
  597. tofree = (struct tnode *)rtnl_dereference(tn->child[i]);
  598. if (tofree)
  599. tnode_free(tofree);
  600. }
  601. tnode_free(tn);
  602. }
/*
 * Double the child array of @tn: build a replacement tnode consuming one
 * extra key bit (bits + 1) and redistribute every child of the old node
 * into it.  Children that were tnodes consuming exactly the next bit are
 * dissolved into the new node; deeper tnodes are split into a left/right
 * pair, each taking half of the original children.
 *
 * Returns the new tnode on success, or ERR_PTR(-ENOMEM) on allocation
 * failure, in which case the old node is left fully intact.
 */
static struct tnode *inflate(struct trie *t, struct tnode *tn)
{
	struct tnode *oldtnode = tn;
	int olen = tnode_child_length(tn);
	int i;

	pr_debug("In inflate\n");

	tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);
	if (!tn)
		return ERR_PTR(-ENOMEM);

	/*
	 * Preallocate and store tnodes before the actual work so we
	 * don't get into an inconsistent state if memory allocation
	 * fails. In case of failure we return the oldnode and inflate
	 * of tnode is ignored.
	 */
	for (i = 0; i < olen; i++) {
		struct tnode *inode;

		inode = (struct tnode *) tnode_get_child(oldtnode, i);
		if (inode &&
		    IS_TNODE(inode) &&
		    inode->pos == oldtnode->pos + oldtnode->bits &&
		    inode->bits > 1) {
			struct tnode *left, *right;
			/* m: single "1" bit at position inode->pos */
			t_key m = ~0U << (KEYLENGTH - 1) >> inode->pos;

			left = tnode_new(inode->key&(~m), inode->pos + 1,
					 inode->bits - 1);
			if (!left)
				goto nomem;

			right = tnode_new(inode->key|m, inode->pos + 1,
					  inode->bits - 1);

			if (!right) {
				tnode_free(left);
				goto nomem;
			}

			put_child(t, tn, 2*i, (struct rt_trie_node *) left);
			put_child(t, tn, 2*i+1, (struct rt_trie_node *) right);
		}
	}

	for (i = 0; i < olen; i++) {
		struct tnode *inode;
		struct rt_trie_node *node = tnode_get_child(oldtnode, i);
		struct tnode *left, *right;
		int size, j;

		/* An empty child */
		if (node == NULL)
			continue;

		/* A leaf or an internal node with skipped bits */
		if (IS_LEAF(node) || ((struct tnode *) node)->pos >
		   tn->pos + tn->bits - 1) {
			/* route to slot 2*i or 2*i+1 by the newly consumed bit */
			if (tkey_extract_bits(node->key,
					      oldtnode->pos + oldtnode->bits,
					      1) == 0)
				put_child(t, tn, 2*i, node);
			else
				put_child(t, tn, 2*i+1, node);
			continue;
		}

		/* An internal node with two children */
		inode = (struct tnode *) node;

		if (inode->bits == 1) {
			/* child consumed exactly one bit: dissolve it */
			put_child(t, tn, 2*i, rtnl_dereference(inode->child[0]));
			put_child(t, tn, 2*i+1, rtnl_dereference(inode->child[1]));

			tnode_free_safe(inode);
			continue;
		}

		/* An internal node with more than two children */

		/* We will replace this node 'inode' with two new
		 * ones, 'left' and 'right', each with half of the
		 * original children. The two new nodes will have
		 * a position one bit further down the key and this
		 * means that the "significant" part of their keys
		 * (see the discussion near the top of this file)
		 * will differ by one bit, which will be "0" in
		 * left's key and "1" in right's key. Since we are
		 * moving the key position by one step, the bit that
		 * we are moving away from - the bit at position
		 * (inode->pos) - is the one that will differ between
		 * left and right. So... we synthesize that bit in the
		 * two new keys.
		 * The mask 'm' below will be a single "one" bit at
		 * the position (inode->pos)
		 */

		/* Use the old key, but set the new significant
		 * bit to zero.
		 */

		/* left/right were preallocated in the first pass above */
		left = (struct tnode *) tnode_get_child(tn, 2*i);
		put_child(t, tn, 2*i, NULL);

		BUG_ON(!left);

		right = (struct tnode *) tnode_get_child(tn, 2*i+1);
		put_child(t, tn, 2*i+1, NULL);

		BUG_ON(!right);

		size = tnode_child_length(left);
		for (j = 0; j < size; j++) {
			put_child(t, left, j, rtnl_dereference(inode->child[j]));
			put_child(t, right, j, rtnl_dereference(inode->child[j + size]));
		}
		put_child(t, tn, 2*i, resize(t, left));
		put_child(t, tn, 2*i+1, resize(t, right));

		tnode_free_safe(inode);
	}
	tnode_free_safe(oldtnode);
	return tn;
nomem:
	tnode_clean_free(tn);
	return ERR_PTR(-ENOMEM);
}
/*
 * Shrink the child array of @tn by one index bit (bits - 1): each pair of
 * adjacent child slots in the old node collapses into one slot of the new
 * node.  If both slots of a pair are occupied, a fresh one-bit binary
 * tnode is created to hold them.
 *
 * Returns the new tnode on success, or ERR_PTR(-ENOMEM) on allocation
 * failure, in which case the old node is left fully intact.
 */
static struct tnode *halve(struct trie *t, struct tnode *tn)
{
	struct tnode *oldtnode = tn;
	struct rt_trie_node *left, *right;
	int i;
	int olen = tnode_child_length(tn);

	pr_debug("In halve\n");

	tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);

	if (!tn)
		return ERR_PTR(-ENOMEM);

	/*
	 * Preallocate and store tnodes before the actual work so we
	 * don't get into an inconsistent state if memory allocation
	 * fails. In case of failure we return the oldnode and halve
	 * of tnode is ignored.
	 */

	for (i = 0; i < olen; i += 2) {
		left = tnode_get_child(oldtnode, i);
		right = tnode_get_child(oldtnode, i+1);

		/* Two nonempty children */
		if (left && right) {
			struct tnode *newn;

			newn = tnode_new(left->key, tn->pos + tn->bits, 1);

			if (!newn)
				goto nomem;

			put_child(t, tn, i/2, (struct rt_trie_node *)newn);
		}

	}

	for (i = 0; i < olen; i += 2) {
		struct tnode *newBinNode;

		left = tnode_get_child(oldtnode, i);
		right = tnode_get_child(oldtnode, i+1);

		/* At least one of the children is empty */
		if (left == NULL) {
			if (right == NULL)    /* Both are empty */
				continue;
			put_child(t, tn, i/2, right);
			continue;
		}

		if (right == NULL) {
			put_child(t, tn, i/2, left);
			continue;
		}

		/* Two nonempty children */
		/* binary node was preallocated in the first pass above */
		newBinNode = (struct tnode *) tnode_get_child(tn, i/2);
		put_child(t, tn, i/2, NULL);
		put_child(t, newBinNode, 0, left);
		put_child(t, newBinNode, 1, right);
		put_child(t, tn, i/2, resize(t, newBinNode));
	}
	tnode_free_safe(oldtnode);
	return tn;
nomem:
	tnode_clean_free(tn);
	return ERR_PTR(-ENOMEM);
}
  765. /* readside must use rcu_read_lock currently dump routines
  766. via get_fa_head and dump */
  767. static struct leaf_info *find_leaf_info(struct leaf *l, int plen)
  768. {
  769. struct hlist_head *head = &l->list;
  770. struct hlist_node *node;
  771. struct leaf_info *li;
  772. hlist_for_each_entry_rcu(li, node, head, hlist)
  773. if (li->plen == plen)
  774. return li;
  775. return NULL;
  776. }
  777. static inline struct list_head *get_fa_head(struct leaf *l, int plen)
  778. {
  779. struct leaf_info *li = find_leaf_info(l, plen);
  780. if (!li)
  781. return NULL;
  782. return &li->falh;
  783. }
/*
 * Insert @new into the leaf's info list, keeping the list ordered by
 * decreasing prefix length.  RCU list primitives are used so that
 * concurrent readers always observe a consistent list.
 */
static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
{
	struct leaf_info *li = NULL, *last = NULL;
	struct hlist_node *node;

	if (hlist_empty(head)) {
		hlist_add_head_rcu(&new->hlist, head);
	} else {
		/* find the last entry whose plen is >= new->plen */
		hlist_for_each_entry(li, node, head, hlist) {
			if (new->plen > li->plen)
				break;

			last = li;
		}
		if (last)
			hlist_add_after_rcu(&last->hlist, &new->hlist);
		else
			/* new has the largest plen: goes before the first entry */
			hlist_add_before_rcu(&new->hlist, &li->hlist);
	}
}
/* rcu_read_lock needs to be hold by caller from readside */

/*
 * Look up the leaf holding exactly @key.  Walks internal nodes while
 * their (possibly skipped) key bits match @key; returns the leaf on an
 * exact key match, NULL otherwise.
 */
static struct leaf *
fib_find_node(struct trie *t, u32 key)
{
	int pos;
	struct tnode *tn;
	struct rt_trie_node *n;

	pos = 0;
	n = rcu_dereference_rtnl(t->trie);

	while (n != NULL && NODE_TYPE(n) == T_TNODE) {
		tn = (struct tnode *) n;

		check_tnode(tn);

		/* skipped bits [pos, tn->pos) must match or the key is absent */
		if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
			pos = tn->pos + tn->bits;
			n = tnode_get_child_rcu(tn,
						tkey_extract_bits(key,
								  tn->pos,
								  tn->bits));
		} else
			break;
	}
	/* Case we have found a leaf. Compare prefixes */

	if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key))
		return (struct leaf *)n;

	return NULL;
}
/*
 * Walk from @tn up to the root, resizing each tnode on the way and
 * reinstalling the (possibly replaced) node into its parent's child
 * array.  A new root is republished with rcu_assign_pointer, and
 * deferred tnode frees are flushed after every step so readers never
 * see a freed node.
 */
static void trie_rebalance(struct trie *t, struct tnode *tn)
{
	int wasfull;
	t_key cindex, key;
	struct tnode *tp;

	key = tn->key;

	while (tn != NULL && (tp = node_parent((struct rt_trie_node *)tn)) != NULL) {
		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
		wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
		tn = (struct tnode *)resize(t, tn);

		tnode_put_child_reorg(tp, cindex,
				      (struct rt_trie_node *)tn, wasfull);

		/* resize may have replaced tn; re-read its parent */
		tp = node_parent((struct rt_trie_node *) tn);
		if (!tp)
			rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);

		tnode_free_flush();
		if (!tp)
			break;
		tn = tp;
	}

	/* Handle last (top) tnode */
	if (IS_TNODE(tn))
		tn = (struct tnode *)resize(t, tn);

	rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
	tnode_free_flush();
}
/* only used from updater-side */

/*
 * Ensure a leaf for @key exists in the trie (creating the leaf, and a
 * connecting tnode if needed), attach a leaf_info for prefix length
 * @plen, rebalance, and return the alias list head for that prefix.
 * Returns NULL on allocation failure.
 */
static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
{
	int pos, newpos;
	struct tnode *tp = NULL, *tn = NULL;
	struct rt_trie_node *n;
	struct leaf *l;
	int missbit;
	struct list_head *fa_head = NULL;
	struct leaf_info *li;
	t_key cindex;

	pos = 0;
	n = rtnl_dereference(t->trie);

	/* If we point to NULL, stop. Either the tree is empty and we should
	 * just put a new leaf in if, or we have reached an empty child slot,
	 * and we should just put our new leaf in that.
	 * If we point to a T_TNODE, check if it matches our key. Note that
	 * a T_TNODE might be skipping any number of bits - its 'pos' need
	 * not be the parent's 'pos'+'bits'!
	 *
	 * If it does match the current key, get pos/bits from it, extract
	 * the index from our key, push the T_TNODE and walk the tree.
	 *
	 * If it doesn't, we have to replace it with a new T_TNODE.
	 *
	 * If we point to a T_LEAF, it might or might not have the same key
	 * as we do. If it does, just change the value, update the T_LEAF's
	 * value, and return it.
	 * If it doesn't, we need to replace it with a T_TNODE.
	 */

	while (n != NULL && NODE_TYPE(n) == T_TNODE) {
		tn = (struct tnode *) n;

		check_tnode(tn);

		if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
			tp = tn;
			pos = tn->pos + tn->bits;
			n = tnode_get_child(tn,
					    tkey_extract_bits(key,
							      tn->pos,
							      tn->bits));

			BUG_ON(n && node_parent(n) != tn);
		} else
			break;
	}

	/*
	 * n  ----> NULL, LEAF or TNODE
	 *
	 * tp is n's (parent) ----> NULL or TNODE
	 */

	BUG_ON(tp && IS_LEAF(tp));

	/* Case 1: n is a leaf. Compare prefixes */

	if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
		/* leaf already exists: just add a new leaf_info for @plen */
		l = (struct leaf *) n;
		li = leaf_info_new(plen);

		if (!li)
			return NULL;

		fa_head = &li->falh;
		insert_leaf_info(&l->list, li);
		goto done;
	}
	l = leaf_new();

	if (!l)
		return NULL;

	l->key = key;
	li = leaf_info_new(plen);

	if (!li) {
		free_leaf(l);
		return NULL;
	}

	fa_head = &li->falh;
	insert_leaf_info(&l->list, li);

	if (t->trie && n == NULL) {
		/* Case 2: n is NULL, and will just insert a new leaf */

		node_set_parent((struct rt_trie_node *)l, tp);

		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
		put_child(t, tp, cindex, (struct rt_trie_node *)l);
	} else {
		/* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
		/*
		 *  Add a new tnode here
		 *  first tnode need some special handling
		 */

		if (tp)
			pos = tp->pos+tp->bits;
		else
			pos = 0;

		if (n) {
			/* branch at the first bit where key and n->key differ */
			newpos = tkey_mismatch(key, pos, n->key);
			tn = tnode_new(n->key, newpos, 1);
		} else {
			newpos = 0;
			tn = tnode_new(key, newpos, 1); /* First tnode */
		}

		if (!tn) {
			free_leaf_info(li);
			free_leaf(l);
			return NULL;
		}

		node_set_parent((struct rt_trie_node *)tn, tp);

		missbit = tkey_extract_bits(key, newpos, 1);
		put_child(t, tn, missbit, (struct rt_trie_node *)l);
		put_child(t, tn, 1-missbit, n);

		if (tp) {
			cindex = tkey_extract_bits(key, tp->pos, tp->bits);
			put_child(t, tp, cindex, (struct rt_trie_node *)tn);
		} else {
			rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
			tp = tn;
		}
	}

	if (tp && tp->pos + tp->bits > 32)
		pr_warn("fib_trie tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
			tp, tp->pos, tp->bits, key, plen);

	/* Rebalance the trie */

	trie_rebalance(t, tp);
done:
	return fa_head;
}
/*
 * Caller must hold RTNL.
 */

/*
 * Insert or replace a route described by @cfg in table @tb.
 *
 * Honours the netlink flags: NLM_F_EXCL (fail with -EEXIST on a match),
 * NLM_F_REPLACE (swap the matching alias in place via RCU),
 * NLM_F_APPEND (insert after existing same-key aliases), and
 * NLM_F_CREATE (required to add a brand-new entry).
 *
 * Returns 0 on success or a negative errno.
 */
int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
{
	struct trie *t = (struct trie *) tb->tb_data;
	struct fib_alias *fa, *new_fa;
	struct list_head *fa_head = NULL;
	struct fib_info *fi;
	int plen = cfg->fc_dst_len;
	u8 tos = cfg->fc_tos;
	u32 key, mask;
	int err;
	struct leaf *l;

	if (plen > 32)
		return -EINVAL;

	key = ntohl(cfg->fc_dst);

	pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen);

	mask = ntohl(inet_make_mask(plen));

	/* destination must not have host bits set beyond the prefix */
	if (key & ~mask)
		return -EINVAL;

	key = key & mask;

	fi = fib_create_info(cfg);
	if (IS_ERR(fi)) {
		err = PTR_ERR(fi);
		goto err;
	}

	l = fib_find_node(t, key);
	fa = NULL;

	if (l) {
		fa_head = get_fa_head(l, plen);
		fa = fib_find_alias(fa_head, tos, fi->fib_priority);
	}

	/* Now fa, if non-NULL, points to the first fib alias
	 * with the same keys [prefix,tos,priority], if such key already
	 * exists or to the node before which we will insert new one.
	 *
	 * If fa is NULL, we will need to allocate a new one and
	 * insert to the head of f.
	 *
	 * If f is NULL, no fib node matched the destination key
	 * and we need to allocate a new one of those as well.
	 */

	if (fa && fa->fa_tos == tos &&
	    fa->fa_info->fib_priority == fi->fib_priority) {
		struct fib_alias *fa_first, *fa_match;

		err = -EEXIST;
		if (cfg->fc_nlflags & NLM_F_EXCL)
			goto out;

		/* We have 2 goals:
		 * 1. Find exact match for type, scope, fib_info to avoid
		 * duplicate routes
		 * 2. Find next 'fa' (or head), NLM_F_APPEND inserts before it
		 */
		fa_match = NULL;
		fa_first = fa;
		fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
		list_for_each_entry_continue(fa, fa_head, fa_list) {
			if (fa->fa_tos != tos)
				break;
			if (fa->fa_info->fib_priority != fi->fib_priority)
				break;
			if (fa->fa_type == cfg->fc_type &&
			    fa->fa_info == fi) {
				fa_match = fa;
				break;
			}
		}

		if (cfg->fc_nlflags & NLM_F_REPLACE) {
			struct fib_info *fi_drop;
			u8 state;

			fa = fa_first;
			if (fa_match) {
				/* replacing an identical entry is a no-op */
				if (fa == fa_match)
					err = 0;
				goto out;
			}
			err = -ENOBUFS;
			new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
			if (new_fa == NULL)
				goto out;

			fi_drop = fa->fa_info;
			new_fa->fa_tos = fa->fa_tos;
			new_fa->fa_info = fi;
			new_fa->fa_type = cfg->fc_type;
			state = fa->fa_state;
			new_fa->fa_state = state & ~FA_S_ACCESSED;

			/* swap alias in place; readers see old or new, never neither */
			list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
			alias_free_mem_rcu(fa);

			fib_release_info(fi_drop);
			if (state & FA_S_ACCESSED)
				rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
			rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
				tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE);

			goto succeeded;
		}
		/* Error if we find a perfect match which
		 * uses the same scope, type, and nexthop
		 * information.
		 */
		if (fa_match)
			goto out;

		if (!(cfg->fc_nlflags & NLM_F_APPEND))
			fa = fa_first;
	}
	err = -ENOENT;
	if (!(cfg->fc_nlflags & NLM_F_CREATE))
		goto out;

	err = -ENOBUFS;
	new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
	if (new_fa == NULL)
		goto out;

	new_fa->fa_info = fi;
	new_fa->fa_tos = tos;
	new_fa->fa_type = cfg->fc_type;
	new_fa->fa_state = 0;
	/*
	 * Insert new entry to the list.
	 */

	if (!fa_head) {
		fa_head = fib_insert_node(t, key, plen);
		if (unlikely(!fa_head)) {
			err = -ENOMEM;
			goto out_free_new_fa;
		}
	}

	if (!plen)
		tb->tb_num_default++;

	list_add_tail_rcu(&new_fa->fa_list,
			  (fa ? &fa->fa_list : fa_head));

	rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
	rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id,
		  &cfg->fc_nlinfo, 0);
succeeded:
	return 0;

out_free_new_fa:
	kmem_cache_free(fn_alias_kmem, new_fa);
out:
	fib_release_info(fi);
err:
	return err;
}
/* should be called with rcu_read_lock */

/*
 * Semantic match of @key against leaf @l: scan each leaf_info (one per
 * prefix length) and its aliases for an entry matching the flow's
 * tos/scope with a usable (non-dead) nexthop.
 *
 * Returns 0 with @res filled on success, the fib_props error code for
 * special route types (e.g. unreachable/prohibit), or 1 when nothing
 * on this leaf matches (caller keeps searching).
 */
static int check_leaf(struct fib_table *tb, struct trie *t, struct leaf *l,
		      t_key key, const struct flowi4 *flp,
		      struct fib_result *res, int fib_flags)
{
	struct leaf_info *li;
	struct hlist_head *hhead = &l->list;
	struct hlist_node *node;

	hlist_for_each_entry_rcu(li, node, hhead, hlist) {
		struct fib_alias *fa;

		/* the leaf key must equal the flow key under this prefix mask */
		if (l->key != (key & li->mask_plen))
			continue;

		list_for_each_entry_rcu(fa, &li->falh, fa_list) {
			struct fib_info *fi = fa->fa_info;
			int nhsel, err;

			if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos)
				continue;
			if (fi->fib_dead)
				continue;
			if (fa->fa_info->fib_scope < flp->flowi4_scope)
				continue;
			fib_alias_accessed(fa);
			err = fib_props[fa->fa_type].error;
			if (err) {
#ifdef CONFIG_IP_FIB_TRIE_STATS
				t->stats.semantic_match_passed++;
#endif
				return err;
			}
			if (fi->fib_flags & RTNH_F_DEAD)
				continue;
			for (nhsel = 0; nhsel < fi->fib_nhs; nhsel++) {
				const struct fib_nh *nh = &fi->fib_nh[nhsel];

				if (nh->nh_flags & RTNH_F_DEAD)
					continue;
				if (flp->flowi4_oif && flp->flowi4_oif != nh->nh_oif)
					continue;

#ifdef CONFIG_IP_FIB_TRIE_STATS
				t->stats.semantic_match_passed++;
#endif
				res->prefixlen = li->plen;
				res->nh_sel = nhsel;
				res->type = fa->fa_type;
				res->scope = fa->fa_info->fib_scope;
				res->fi = fi;
				res->table = tb;
				res->fa_head = &li->falh;
				if (!(fib_flags & FIB_LOOKUP_NOREF))
					atomic_inc(&fi->fib_clntref);
				return 0;
			}
		}

#ifdef CONFIG_IP_FIB_TRIE_STATS
		t->stats.semantic_match_miss++;
#endif
	}

	return 1;
}
/*
 * Longest-prefix-match lookup of flp->daddr in table @tb.
 *
 * Descends the trie indexing by key bits, and on a miss backtracks by
 * progressively "chopping off" low index bits (treating them as zero)
 * and walking back up to parents — implementing LPM without recursion.
 * current_prefix_length tracks how many leading key bits are still
 * required to match exactly.
 *
 * Returns 0 with @res filled when a route is found, a fib_props error
 * code for special route types, or 1 when no route matches.
 * Runs entirely under rcu_read_lock.
 */
int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
		     struct fib_result *res, int fib_flags)
{
	struct trie *t = (struct trie *) tb->tb_data;
	int ret;
	struct rt_trie_node *n;
	struct tnode *pn;
	unsigned int pos, bits;
	t_key key = ntohl(flp->daddr);
	unsigned int chopped_off;
	t_key cindex = 0;
	unsigned int current_prefix_length = KEYLENGTH;
	struct tnode *cn;
	t_key pref_mismatch;

	rcu_read_lock();

	n = rcu_dereference(t->trie);
	if (!n)
		goto failed;

#ifdef CONFIG_IP_FIB_TRIE_STATS
	t->stats.gets++;
#endif

	/* Just a leaf? */
	if (IS_LEAF(n)) {
		ret = check_leaf(tb, t, (struct leaf *)n, key, flp, res, fib_flags);
		goto found;
	}

	pn = (struct tnode *) n;
	chopped_off = 0;

	while (pn) {
		pos = pn->pos;
		bits = pn->bits;

		/* after a backtrace, cindex was already adjusted in place */
		if (!chopped_off)
			cindex = tkey_extract_bits(mask_pfx(key, current_prefix_length),
						   pos, bits);

		n = tnode_get_child_rcu(pn, cindex);

		if (n == NULL) {
#ifdef CONFIG_IP_FIB_TRIE_STATS
			t->stats.null_node_hit++;
#endif
			goto backtrace;
		}

		if (IS_LEAF(n)) {
			ret = check_leaf(tb, t, (struct leaf *)n, key, flp, res, fib_flags);
			if (ret > 0)
				goto backtrace;
			goto found;
		}

		cn = (struct tnode *)n;

		/*
		 * It's a tnode, and we can do some extra checks here if we
		 * like, to avoid descending into a dead-end branch.
		 * This tnode is in the parent's child array at index
		 * key[p_pos..p_pos+p_bits] but potentially with some bits
		 * chopped off, so in reality the index may be just a
		 * subprefix, padded with zero at the end.
		 * We can also take a look at any skipped bits in this
		 * tnode - everything up to p_pos is supposed to be ok,
		 * and the non-chopped bits of the index (se previous
		 * paragraph) are also guaranteed ok, but the rest is
		 * considered unknown.
		 *
		 * The skipped bits are key[pos+bits..cn->pos].
		 */

		/* If current_prefix_length < pos+bits, we are already doing
		 * actual prefix  matching, which means everything from
		 * pos+(bits-chopped_off) onward must be zero along some
		 * branch of this subtree - otherwise there is *no* valid
		 * prefix present. Here we can only check the skipped
		 * bits. Remember, since we have already indexed into the
		 * parent's child array, we know that the bits we chopped of
		 * *are* zero.
		 */

		/* NOTA BENE: Checking only skipped bits
		   for the new node here */

		if (current_prefix_length < pos+bits) {
			if (tkey_extract_bits(cn->key, current_prefix_length,
						cn->pos - current_prefix_length)
			    || !(cn->child[0]))
				goto backtrace;
		}

		/*
		 * If chopped_off=0, the index is fully validated and we
		 * only need to look at the skipped bits for this, the new,
		 * tnode. What we actually want to do is to find out if
		 * these skipped bits match our key perfectly, or if we will
		 * have to count on finding a matching prefix further down,
		 * because if we do, we would like to have some way of
		 * verifying the existence of such a prefix at this point.
		 */

		/* The only thing we can do at this point is to verify that
		 * any such matching prefix can indeed be a prefix to our
		 * key, and if the bits in the node we are inspecting that
		 * do not match our key are not ZERO, this cannot be true.
		 * Thus, find out where there is a mismatch (before cn->pos)
		 * and verify that all the mismatching bits are zero in the
		 * new tnode's key.
		 */

		/*
		 * Note: We aren't very concerned about the piece of
		 * the key that precede pn->pos+pn->bits, since these
		 * have already been checked. The bits after cn->pos
		 * aren't checked since these are by definition
		 * "unknown" at this point. Thus, what we want to see
		 * is if we are about to enter the "prefix matching"
		 * state, and in that case verify that the skipped
		 * bits that will prevail throughout this subtree are
		 * zero, as they have to be if we are to find a
		 * matching prefix.
		 */

		pref_mismatch = mask_pfx(cn->key ^ key, cn->pos);

		/*
		 * In short: If skipped bits in this node do not match
		 * the search key, enter the "prefix matching"
		 * state.directly.
		 */
		if (pref_mismatch) {
			/* mp: position of the first mismatching bit */
			int mp = KEYLENGTH - fls(pref_mismatch);

			if (tkey_extract_bits(cn->key, mp, cn->pos - mp) != 0)
				goto backtrace;

			if (current_prefix_length >= cn->pos)
				current_prefix_length = mp;
		}

		pn = (struct tnode *)n; /* Descend */
		chopped_off = 0;
		continue;

backtrace:
		chopped_off++;

		/* As zero don't change the child key (cindex) */
		while ((chopped_off <= pn->bits)
		       && !(cindex & (1<<(chopped_off-1))))
			chopped_off++;

		/* Decrease current_... with bits chopped off */
		if (current_prefix_length > pn->pos + pn->bits - chopped_off)
			current_prefix_length = pn->pos + pn->bits
				- chopped_off;

		/*
		 * Either we do the actual chop off according or if we have
		 * chopped off all bits in this tnode walk up to our parent.
		 */
		if (chopped_off <= pn->bits) {
			cindex &= ~(1 << (chopped_off-1));
		} else {
			struct tnode *parent = node_parent_rcu((struct rt_trie_node *) pn);
			if (!parent)
				goto failed;

			/* Get Child's index */
			cindex = tkey_extract_bits(pn->key, parent->pos, parent->bits);
			pn = parent;
			chopped_off = 0;

#ifdef CONFIG_IP_FIB_TRIE_STATS
			t->stats.backtrack++;
#endif
			goto backtrace;
		}
	}
failed:
	ret = 1;
found:
	rcu_read_unlock();
	return ret;
}
EXPORT_SYMBOL_GPL(fib_table_lookup);
  1334. /*
  1335. * Remove the leaf and return parent.
  1336. */
  1337. static void trie_leaf_remove(struct trie *t, struct leaf *l)
  1338. {
  1339. struct tnode *tp = node_parent((struct rt_trie_node *) l);
  1340. pr_debug("entering trie_leaf_remove(%p)\n", l);
  1341. if (tp) {
  1342. t_key cindex = tkey_extract_bits(l->key, tp->pos, tp->bits);
  1343. put_child(t, tp, cindex, NULL);
  1344. trie_rebalance(t, tp);
  1345. } else
  1346. RCU_INIT_POINTER(t->trie, NULL);
  1347. free_leaf(l);
  1348. }
/*
 * Caller must hold RTNL.
 */

/*
 * Delete the route described by @cfg from table @tb.  Finds the leaf
 * for the (key, plen) pair, then the matching alias (by tos, type,
 * scope, prefsrc, protocol, nexthop), unlinks it via RCU and tears
 * down the leaf_info/leaf if they become empty.
 *
 * Returns 0 on success, -EINVAL for a bad prefix, -ESRCH if no
 * matching route exists.
 */
int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
{
	struct trie *t = (struct trie *) tb->tb_data;
	u32 key, mask;
	int plen = cfg->fc_dst_len;
	u8 tos = cfg->fc_tos;
	struct fib_alias *fa, *fa_to_delete;
	struct list_head *fa_head;
	struct leaf *l;
	struct leaf_info *li;

	if (plen > 32)
		return -EINVAL;

	key = ntohl(cfg->fc_dst);
	mask = ntohl(inet_make_mask(plen));

	/* destination must not have host bits set beyond the prefix */
	if (key & ~mask)
		return -EINVAL;

	key = key & mask;
	l = fib_find_node(t, key);

	if (!l)
		return -ESRCH;

	fa_head = get_fa_head(l, plen);
	fa = fib_find_alias(fa_head, tos, 0);

	if (!fa)
		return -ESRCH;

	pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);

	fa_to_delete = NULL;
	/* restart iteration from fa itself (continue starts at ->next) */
	fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
	list_for_each_entry_continue(fa, fa_head, fa_list) {
		struct fib_info *fi = fa->fa_info;

		if (fa->fa_tos != tos)
			break;

		if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) &&
		    (cfg->fc_scope == RT_SCOPE_NOWHERE ||
		     fa->fa_info->fib_scope == cfg->fc_scope) &&
		    (!cfg->fc_prefsrc ||
		     fi->fib_prefsrc == cfg->fc_prefsrc) &&
		    (!cfg->fc_protocol ||
		     fi->fib_protocol == cfg->fc_protocol) &&
		    fib_nh_match(cfg, fi) == 0) {
			fa_to_delete = fa;
			break;
		}
	}

	if (!fa_to_delete)
		return -ESRCH;

	fa = fa_to_delete;
	rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id,
		  &cfg->fc_nlinfo, 0);

	l = fib_find_node(t, key);
	li = find_leaf_info(l, plen);

	list_del_rcu(&fa->fa_list);

	if (!plen)
		tb->tb_num_default--;

	if (list_empty(fa_head)) {
		hlist_del_rcu(&li->hlist);
		free_leaf_info(li);
	}

	if (hlist_empty(&l->list))
		trie_leaf_remove(t, l);

	if (fa->fa_state & FA_S_ACCESSED)
		rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);

	fib_release_info(fa->fa_info);
	alias_free_mem_rcu(fa);
	return 0;
}
  1417. static int trie_flush_list(struct list_head *head)
  1418. {
  1419. struct fib_alias *fa, *fa_node;
  1420. int found = 0;
  1421. list_for_each_entry_safe(fa, fa_node, head, fa_list) {
  1422. struct fib_info *fi = fa->fa_info;
  1423. if (fi && (fi->fib_flags & RTNH_F_DEAD)) {
  1424. list_del_rcu(&fa->fa_list);
  1425. fib_release_info(fa->fa_info);
  1426. alias_free_mem_rcu(fa);
  1427. found++;
  1428. }
  1429. }
  1430. return found;
  1431. }
  1432. static int trie_flush_leaf(struct leaf *l)
  1433. {
  1434. int found = 0;
  1435. struct hlist_head *lih = &l->list;
  1436. struct hlist_node *node, *tmp;
  1437. struct leaf_info *li = NULL;
  1438. hlist_for_each_entry_safe(li, node, tmp, lih, hlist) {
  1439. found += trie_flush_list(&li->falh);
  1440. if (list_empty(&li->falh)) {
  1441. hlist_del_rcu(&li->hlist);
  1442. free_leaf_info(li);
  1443. }
  1444. }
  1445. return found;
  1446. }
/*
 * Scan for the next right leaf starting at node p->child[idx]
 * Since we have back pointer, no recursion necessary.
 */

/*
 * Find the next leaf in key order: resume scanning @p's children just
 * after @c (or from slot 0 when @c is NULL), descending into tnodes
 * and climbing to the parent when a node is exhausted.  Returns NULL
 * once the root has been fully scanned.
 */
static struct leaf *leaf_walk_rcu(struct tnode *p, struct rt_trie_node *c)
{
	do {
		t_key idx;

		/* resume one past c's slot in p, or from the start */
		if (c)
			idx = tkey_extract_bits(c->key, p->pos, p->bits) + 1;
		else
			idx = 0;

		while (idx < 1u << p->bits) {
			c = tnode_get_child_rcu(p, idx++);
			if (!c)
				continue;

			if (IS_LEAF(c))
				return (struct leaf *) c;

			/* Rescan start scanning in new node */
			p = (struct tnode *) c;
			idx = 0;
		}

		/* Node empty, walk back up to parent */
		c = (struct rt_trie_node *) p;
	} while ((p = node_parent_rcu(c)) != NULL);

	return NULL; /* Root of trie */
}
  1474. static struct leaf *trie_firstleaf(struct trie *t)
  1475. {
  1476. struct tnode *n = (struct tnode *)rcu_dereference_rtnl(t->trie);
  1477. if (!n)
  1478. return NULL;
  1479. if (IS_LEAF(n)) /* trie is just a leaf */
  1480. return (struct leaf *) n;
  1481. return leaf_walk_rcu(n, NULL);
  1482. }
  1483. static struct leaf *trie_nextleaf(struct leaf *l)
  1484. {
  1485. struct rt_trie_node *c = (struct rt_trie_node *) l;
  1486. struct tnode *p = node_parent_rcu(c);
  1487. if (!p)
  1488. return NULL; /* trie with just one leaf */
  1489. return leaf_walk_rcu(p, c);
  1490. }
  1491. static struct leaf *trie_leafindex(struct trie *t, int index)
  1492. {
  1493. struct leaf *l = trie_firstleaf(t);
  1494. while (l && index-- > 0)
  1495. l = trie_nextleaf(l);
  1496. return l;
  1497. }
/*
 * Caller must hold RTNL.
 */

/*
 * Remove all RTNH_F_DEAD routes from table @tb.  Leaf removal is
 * deferred by one iteration (ll trails l) so trie_nextleaf() is never
 * called on a leaf that has just been unlinked.  Returns the number of
 * aliases flushed.
 */
int fib_table_flush(struct fib_table *tb)
{
	struct trie *t = (struct trie *) tb->tb_data;
	struct leaf *l, *ll = NULL;
	int found = 0;

	for (l = trie_firstleaf(t); l; l = trie_nextleaf(l)) {
		found += trie_flush_leaf(l);

		if (ll && hlist_empty(&ll->list))
			trie_leaf_remove(t, ll);
		ll = l;
	}

	/* dispose of the final leaf, which the loop above left pending */
	if (ll && hlist_empty(&ll->list))
		trie_leaf_remove(t, ll);

	pr_debug("trie_flush found=%d\n", found);
	return found;
}
/* Free the fib_table allocation itself.
 * NOTE(review): frees only the table struct — presumably callers have
 * already emptied the embedded trie (e.g. via fib_table_flush); confirm
 * against call sites. */
void fib_free_table(struct fib_table *tb)
{
	kfree(tb);
}
/*
 * Dump the aliases of one (key, plen) pair into @skb as RTM_NEWROUTE
 * messages.  cb->args[5] is the resume cursor: entries below it were
 * already emitted in a previous dump round.  Returns skb->len on
 * completion or -1 when the skb filled up (cursor saved for resume).
 */
static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah,
			   struct fib_table *tb,
			   struct sk_buff *skb, struct netlink_callback *cb)
{
	int i, s_i;
	struct fib_alias *fa;
	__be32 xkey = htonl(key);

	s_i = cb->args[5];
	i = 0;

	/* rcu_read_lock is hold by caller */

	list_for_each_entry_rcu(fa, fah, fa_list) {
		/* skip entries emitted by an earlier partial dump */
		if (i < s_i) {
			i++;
			continue;
		}

		if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
				  cb->nlh->nlmsg_seq,
				  RTM_NEWROUTE,
				  tb->tb_id,
				  fa->fa_type,
				  xkey,
				  plen,
				  fa->fa_tos,
				  fa->fa_info, NLM_F_MULTI) < 0) {
			cb->args[5] = i;
			return -1;
		}
		i++;
	}
	cb->args[5] = i;
	return skb->len;
}
/*
 * Dump all prefix lengths (leaf_info entries) of leaf @l.  cb->args[4]
 * is the per-leaf resume cursor; cb->args[5] (per-prefix cursor) is
 * reset whenever we move on to a fresh leaf_info.  Returns skb->len on
 * completion or -1 when the skb filled up.
 */
static int fn_trie_dump_leaf(struct leaf *l, struct fib_table *tb,
			struct sk_buff *skb, struct netlink_callback *cb)
{
	struct leaf_info *li;
	struct hlist_node *node;
	int i, s_i;

	s_i = cb->args[4];
	i = 0;

	/* rcu_read_lock is hold by caller */
	hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
		/* skip entries emitted by an earlier partial dump */
		if (i < s_i) {
			i++;
			continue;
		}

		/* moved past the resume point: restart the inner cursor */
		if (i > s_i)
			cb->args[5] = 0;

		if (list_empty(&li->falh))
			continue;

		if (fn_trie_dump_fa(l->key, li->plen, &li->falh, tb, skb, cb) < 0) {
			cb->args[4] = i;
			return -1;
		}
		i++;
	}

	cb->args[4] = i;
	return skb->len;
}
/* Netlink RTM_GETROUTE dump of one table.  Resume state lives in
 * cb->args[]: [2] = key of last leaf, [3] = number of leaves already
 * dumped, [4]/[5] = per-leaf progress (see fn_trie_dump_leaf()).
 */
int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
		   struct netlink_callback *cb)
{
	struct leaf *l;
	struct trie *t = (struct trie *) tb->tb_data;
	t_key key = cb->args[2];	/* key of the last dumped leaf */
	int count = cb->args[3];	/* leaves fully dumped so far */

	rcu_read_lock();
	/* Dump starting at last key.
	 * Note: 0.0.0.0/0 (ie default) is first key.
	 */
	if (count == 0)
		l = trie_firstleaf(t);
	else {
		/* Normally, continue from last key, but if that is missing
		 * fallback to using slow rescan
		 */
		l = fib_find_node(t, key);
		if (!l)
			l = trie_leafindex(t, count);
	}

	while (l) {
		cb->args[2] = l->key;
		if (fn_trie_dump_leaf(l, tb, skb, cb) < 0) {
			cb->args[3] = count;
			rcu_read_unlock();
			return -1;
		}

		++count;
		l = trie_nextleaf(l);
		/* Leaf finished: zero args[4..] so the next leaf starts
		 * its per-leaf resume counters from scratch.
		 */
		memset(&cb->args[4], 0,
		       sizeof(cb->args) - 4*sizeof(cb->args[0]));
	}
	cb->args[3] = count;

	rcu_read_unlock();
	return skb->len;
}
/* Create the slab caches used by the trie.  SLAB_PANIC makes boot
 * fail hard if either cache cannot be created.
 */
void __init fib_trie_init(void)
{
	fn_alias_kmem = kmem_cache_create("ip_fib_alias",
					  sizeof(struct fib_alias),
					  0, SLAB_PANIC, NULL);

	/* leaf and leaf_info share one cache sized for the larger. */
	trie_leaf_kmem = kmem_cache_create("ip_fib_trie",
					   max(sizeof(struct leaf),
					       sizeof(struct leaf_info)),
					   0, SLAB_PANIC, NULL);
}
  1627. struct fib_table *fib_trie_table(u32 id)
  1628. {
  1629. struct fib_table *tb;
  1630. struct trie *t;
  1631. tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie),
  1632. GFP_KERNEL);
  1633. if (tb == NULL)
  1634. return NULL;
  1635. tb->tb_id = id;
  1636. tb->tb_default = -1;
  1637. tb->tb_num_default = 0;
  1638. t = (struct trie *) tb->tb_data;
  1639. memset(t, 0, sizeof(*t));
  1640. return tb;
  1641. }
  1642. #ifdef CONFIG_PROC_FS
/* Depth first Trie walk iterator */
struct fib_trie_iter {
	struct seq_net_private p;
	struct fib_table *tb;		/* table owning the current node */
	struct tnode *tnode;		/* tnode whose children are being scanned */
	unsigned int index;		/* next child slot to try in tnode */
	unsigned int depth;		/* depth of the current node below root */
};
/* Advance the depth-first walk by one node and return it, or NULL when
 * the trie is exhausted.  iter->tnode/iter->index always name the next
 * child slot to try on the following call.  Runs under RCU (see the
 * _rcu accessors and the callers' rcu_read_lock()).
 */
static struct rt_trie_node *fib_trie_get_next(struct fib_trie_iter *iter)
{
	struct tnode *tn = iter->tnode;
	unsigned int cindex = iter->index;
	struct tnode *p;

	/* A single entry routing table */
	if (!tn)
		return NULL;

	pr_debug("get_next iter={node=%p index=%d depth=%d}\n",
		 iter->tnode, iter->index, iter->depth);
rescan:
	/* Scan the remaining child slots of the current tnode. */
	while (cindex < (1<<tn->bits)) {
		struct rt_trie_node *n = tnode_get_child_rcu(tn, cindex);

		if (n) {
			if (IS_LEAF(n)) {
				/* Stay on this tnode; resume at the
				 * next slot on the following call.
				 */
				iter->tnode = tn;
				iter->index = cindex + 1;
			} else {
				/* push down one level */
				iter->tnode = (struct tnode *) n;
				iter->index = 0;
				++iter->depth;
			}
			return n;
		}

		++cindex;
	}

	/* Current node exhausted, pop back up */
	p = node_parent_rcu((struct rt_trie_node *)tn);
	if (p) {
		/* Resume in the parent just past the slot that led
		 * down to tn (recovered from tn's key bits).
		 */
		cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1;
		tn = p;
		--iter->depth;
		goto rescan;
	}

	/* got root? */
	return NULL;
}
  1689. static struct rt_trie_node *fib_trie_get_first(struct fib_trie_iter *iter,
  1690. struct trie *t)
  1691. {
  1692. struct rt_trie_node *n;
  1693. if (!t)
  1694. return NULL;
  1695. n = rcu_dereference(t->trie);
  1696. if (!n)
  1697. return NULL;
  1698. if (IS_TNODE(n)) {
  1699. iter->tnode = (struct tnode *) n;
  1700. iter->index = 0;
  1701. iter->depth = 1;
  1702. } else {
  1703. iter->tnode = NULL;
  1704. iter->index = 0;
  1705. iter->depth = 0;
  1706. }
  1707. return n;
  1708. }
  1709. static void trie_collect_stats(struct trie *t, struct trie_stat *s)
  1710. {
  1711. struct rt_trie_node *n;
  1712. struct fib_trie_iter iter;
  1713. memset(s, 0, sizeof(*s));
  1714. rcu_read_lock();
  1715. for (n = fib_trie_get_first(&iter, t); n; n = fib_trie_get_next(&iter)) {
  1716. if (IS_LEAF(n)) {
  1717. struct leaf *l = (struct leaf *)n;
  1718. struct leaf_info *li;
  1719. struct hlist_node *tmp;
  1720. s->leaves++;
  1721. s->totdepth += iter.depth;
  1722. if (iter.depth > s->maxdepth)
  1723. s->maxdepth = iter.depth;
  1724. hlist_for_each_entry_rcu(li, tmp, &l->list, hlist)
  1725. ++s->prefixes;
  1726. } else {
  1727. const struct tnode *tn = (const struct tnode *) n;
  1728. int i;
  1729. s->tnodes++;
  1730. if (tn->bits < MAX_STAT_DEPTH)
  1731. s->nodesizes[tn->bits]++;
  1732. for (i = 0; i < (1<<tn->bits); i++)
  1733. if (!tn->child[i])
  1734. s->nullpointers++;
  1735. }
  1736. }
  1737. rcu_read_unlock();
  1738. }
  1739. /*
  1740. * This outputs /proc/net/fib_triestats
  1741. */
  1742. static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
  1743. {
  1744. unsigned int i, max, pointers, bytes, avdepth;
  1745. if (stat->leaves)
  1746. avdepth = stat->totdepth*100 / stat->leaves;
  1747. else
  1748. avdepth = 0;
  1749. seq_printf(seq, "\tAver depth: %u.%02d\n",
  1750. avdepth / 100, avdepth % 100);
  1751. seq_printf(seq, "\tMax depth: %u\n", stat->maxdepth);
  1752. seq_printf(seq, "\tLeaves: %u\n", stat->leaves);
  1753. bytes = sizeof(struct leaf) * stat->leaves;
  1754. seq_printf(seq, "\tPrefixes: %u\n", stat->prefixes);
  1755. bytes += sizeof(struct leaf_info) * stat->prefixes;
  1756. seq_printf(seq, "\tInternal nodes: %u\n\t", stat->tnodes);
  1757. bytes += sizeof(struct tnode) * stat->tnodes;
  1758. max = MAX_STAT_DEPTH;
  1759. while (max > 0 && stat->nodesizes[max-1] == 0)
  1760. max--;
  1761. pointers = 0;
  1762. for (i = 1; i <= max; i++)
  1763. if (stat->nodesizes[i] != 0) {
  1764. seq_printf(seq, " %u: %u", i, stat->nodesizes[i]);
  1765. pointers += (1<<i) * stat->nodesizes[i];
  1766. }
  1767. seq_putc(seq, '\n');
  1768. seq_printf(seq, "\tPointers: %u\n", pointers);
  1769. bytes += sizeof(struct rt_trie_node *) * pointers;
  1770. seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers);
  1771. seq_printf(seq, "Total size: %u kB\n", (bytes + 1023) / 1024);
  1772. }
  1773. #ifdef CONFIG_IP_FIB_TRIE_STATS
/* Append the CONFIG_IP_FIB_TRIE_STATS lookup counters to the
 * fib_triestat output.
 */
static void trie_show_usage(struct seq_file *seq,
			    const struct trie_use_stats *stats)
{
	seq_printf(seq, "\nCounters:\n---------\n");
	seq_printf(seq, "gets = %u\n", stats->gets);
	seq_printf(seq, "backtracks = %u\n", stats->backtrack);
	seq_printf(seq, "semantic match passed = %u\n",
		   stats->semantic_match_passed);
	seq_printf(seq, "semantic match miss = %u\n",
		   stats->semantic_match_miss);
	seq_printf(seq, "null node hit= %u\n", stats->null_node_hit);
	seq_printf(seq, "skipped node resize = %u\n\n",
		   stats->resize_node_skipped);
}
  1788. #endif /* CONFIG_IP_FIB_TRIE_STATS */
  1789. static void fib_table_print(struct seq_file *seq, struct fib_table *tb)
  1790. {
  1791. if (tb->tb_id == RT_TABLE_LOCAL)
  1792. seq_puts(seq, "Local:\n");
  1793. else if (tb->tb_id == RT_TABLE_MAIN)
  1794. seq_puts(seq, "Main:\n");
  1795. else
  1796. seq_printf(seq, "Id %d:\n", tb->tb_id);
  1797. }
/* Render /proc/net/fib_triestat: per-table size/depth statistics for
 * every table in every hash bucket, plus usage counters when
 * CONFIG_IP_FIB_TRIE_STATS is enabled.
 */
static int fib_triestat_seq_show(struct seq_file *seq, void *v)
{
	struct net *net = (struct net *)seq->private;
	unsigned int h;

	seq_printf(seq,
		   "Basic info: size of leaf:"
		   " %Zd bytes, size of tnode: %Zd bytes.\n",
		   sizeof(struct leaf), sizeof(struct tnode));

	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
		struct hlist_node *node;
		struct fib_table *tb;

		hlist_for_each_entry_rcu(tb, node, head, tb_hlist) {
			struct trie *t = (struct trie *) tb->tb_data;
			struct trie_stat stat;

			if (!t)
				continue;

			fib_table_print(seq, tb);

			/* Collect under RCU, then format. */
			trie_collect_stats(t, &stat);
			trie_show_stats(seq, &stat);
#ifdef CONFIG_IP_FIB_TRIE_STATS
			trie_show_usage(seq, &t->stats);
#endif
		}
	}

	return 0;
}
/* open() for /proc/net/fib_triestat — single-shot, net-namespace aware. */
static int fib_triestat_seq_open(struct inode *inode, struct file *file)
{
	return single_open_net(inode, file, fib_triestat_seq_show);
}
/* File operations for /proc/net/fib_triestat. */
static const struct file_operations fib_triestat_fops = {
	.owner	= THIS_MODULE,
	.open	= fib_triestat_seq_open,
	.read	= seq_read,
	.llseek	= seq_lseek,
	.release = single_release_net,
};
/* Return the pos'th trie node counted across all tables in all hash
 * buckets, recording its table in iter->tb; NULL when pos is past the
 * end.  Runs under the RCU read lock taken by fib_trie_seq_start().
 */
static struct rt_trie_node *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
{
	struct fib_trie_iter *iter = seq->private;
	struct net *net = seq_file_net(seq);
	loff_t idx = 0;
	unsigned int h;

	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
		struct hlist_node *node;
		struct fib_table *tb;

		hlist_for_each_entry_rcu(tb, node, head, tb_hlist) {
			struct rt_trie_node *n;

			for (n = fib_trie_get_first(iter,
						    (struct trie *) tb->tb_data);
			     n; n = fib_trie_get_next(iter))
				if (pos == idx++) {
					iter->tb = tb;
					return n;
				}
		}
	}

	return NULL;
}
/* seq_file ->start for /proc/net/fib_trie.  Takes the RCU read lock,
 * released in fib_trie_seq_stop().
 */
static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(RCU)
{
	rcu_read_lock();
	return fib_trie_get_idx(seq, *pos);
}
/* seq_file ->next: advance to the next trie node, continuing into the
 * next table on the same hash chain and then into later hash buckets
 * once the current trie is exhausted.
 */
static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct fib_trie_iter *iter = seq->private;
	struct net *net = seq_file_net(seq);
	struct fib_table *tb = iter->tb;
	struct hlist_node *tb_node;
	unsigned int h;
	struct rt_trie_node *n;

	++*pos;
	/* next node in same table */
	n = fib_trie_get_next(iter);
	if (n)
		return n;

	/* walk rest of this hash chain */
	h = tb->tb_id & (FIB_TABLE_HASHSZ - 1);
	while ((tb_node = rcu_dereference(hlist_next_rcu(&tb->tb_hlist)))) {
		tb = hlist_entry(tb_node, struct fib_table, tb_hlist);
		n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
		if (n)
			goto found;
	}

	/* new hash chain */
	while (++h < FIB_TABLE_HASHSZ) {
		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
		hlist_for_each_entry_rcu(tb, tb_node, head, tb_hlist) {
			n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
			if (n)
				goto found;
		}
	}
	return NULL;

found:
	/* Remember which table the returned node belongs to. */
	iter->tb = tb;
	return n;
}
/* seq_file ->stop: drop the RCU read lock taken in ->start. */
static void fib_trie_seq_stop(struct seq_file *seq, void *v)
	__releases(RCU)
{
	rcu_read_unlock();
}
  1905. static void seq_indent(struct seq_file *seq, int n)
  1906. {
  1907. while (n-- > 0)
  1908. seq_puts(seq, " ");
  1909. }
  1910. static inline const char *rtn_scope(char *buf, size_t len, enum rt_scope_t s)
  1911. {
  1912. switch (s) {
  1913. case RT_SCOPE_UNIVERSE: return "universe";
  1914. case RT_SCOPE_SITE: return "site";
  1915. case RT_SCOPE_LINK: return "link";
  1916. case RT_SCOPE_HOST: return "host";
  1917. case RT_SCOPE_NOWHERE: return "nowhere";
  1918. default:
  1919. snprintf(buf, len, "scope=%d", s);
  1920. return buf;
  1921. }
  1922. }
/* Printable names for the RTN_* route types, indexed by type value;
 * gaps stay NULL and are handled by rtn_type().
 */
static const char *const rtn_type_names[__RTN_MAX] = {
	[RTN_UNSPEC] = "UNSPEC",
	[RTN_UNICAST] = "UNICAST",
	[RTN_LOCAL] = "LOCAL",
	[RTN_BROADCAST] = "BROADCAST",
	[RTN_ANYCAST] = "ANYCAST",
	[RTN_MULTICAST] = "MULTICAST",
	[RTN_BLACKHOLE] = "BLACKHOLE",
	[RTN_UNREACHABLE] = "UNREACHABLE",
	[RTN_PROHIBIT] = "PROHIBIT",
	[RTN_THROW] = "THROW",
	[RTN_NAT] = "NAT",
	[RTN_XRESOLVE] = "XRESOLVE",
};
  1937. static inline const char *rtn_type(char *buf, size_t len, unsigned int t)
  1938. {
  1939. if (t < __RTN_MAX && rtn_type_names[t])
  1940. return rtn_type_names[t];
  1941. snprintf(buf, len, "type %u", t);
  1942. return buf;
  1943. }
/* Pretty print the trie */
static int fib_trie_seq_show(struct seq_file *seq, void *v)
{
	const struct fib_trie_iter *iter = seq->private;
	struct rt_trie_node *n = v;

	/* A node with no parent is a trie root: emit the table heading
	 * before its subtree.
	 */
	if (!node_parent_rcu(n))
		fib_table_print(seq, iter->tb);

	if (IS_TNODE(n)) {
		struct tnode *tn = (struct tnode *) n;
		__be32 prf = htonl(mask_pfx(tn->key, tn->pos));

		seq_indent(seq, iter->depth-1);
		seq_printf(seq, " +-- %pI4/%d %d %d %d\n",
			   &prf, tn->pos, tn->bits, tn->full_children,
			   tn->empty_children);
	} else {
		struct leaf *l = (struct leaf *) n;
		struct leaf_info *li;
		struct hlist_node *node;
		__be32 val = htonl(l->key);

		seq_indent(seq, iter->depth);
		seq_printf(seq, " |-- %pI4\n", &val);

		/* One line per (prefix length, alias) under the leaf. */
		hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
			struct fib_alias *fa;

			list_for_each_entry_rcu(fa, &li->falh, fa_list) {
				char buf1[32], buf2[32];

				seq_indent(seq, iter->depth+1);
				seq_printf(seq, " /%d %s %s", li->plen,
					   rtn_scope(buf1, sizeof(buf1),
						     fa->fa_info->fib_scope),
					   rtn_type(buf2, sizeof(buf2),
						    fa->fa_type));
				if (fa->fa_tos)
					seq_printf(seq, " tos=%d", fa->fa_tos);
				seq_putc(seq, '\n');
			}
		}
	}

	return 0;
}
/* seq_file iterator for /proc/net/fib_trie. */
static const struct seq_operations fib_trie_seq_ops = {
	.start  = fib_trie_seq_start,
	.next   = fib_trie_seq_next,
	.stop   = fib_trie_seq_stop,
	.show   = fib_trie_seq_show,
};
/* open() for /proc/net/fib_trie — allocates a fib_trie_iter as the
 * per-reader seq_file private state.
 */
static int fib_trie_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &fib_trie_seq_ops,
			    sizeof(struct fib_trie_iter));
}
/* File operations for /proc/net/fib_trie. */
static const struct file_operations fib_trie_fops = {
	.owner  = THIS_MODULE,
	.open   = fib_trie_seq_open,
	.read   = seq_read,
	.llseek = seq_lseek,
	.release = seq_release_net,
};
/* Iterator state for /proc/net/route: caches the position and key of
 * the last visited leaf so sequential reads can resume without
 * rescanning from the first leaf.
 */
struct fib_route_iter {
	struct seq_net_private p;
	struct trie *main_trie;		/* trie of the MAIN table */
	loff_t	pos;			/* logical index of the cached leaf */
	t_key	key;			/* key used to re-find the cached leaf */
};
  2007. static struct leaf *fib_route_get_idx(struct fib_route_iter *iter, loff_t pos)
  2008. {
  2009. struct leaf *l = NULL;
  2010. struct trie *t = iter->main_trie;
  2011. /* use cache location of last found key */
  2012. if (iter->pos > 0 && pos >= iter->pos && (l = fib_find_node(t, iter->key)))
  2013. pos -= iter->pos;
  2014. else {
  2015. iter->pos = 0;
  2016. l = trie_firstleaf(t);
  2017. }
  2018. while (l && pos-- > 0) {
  2019. iter->pos++;
  2020. l = trie_nextleaf(l);
  2021. }
  2022. if (l)
  2023. iter->key = pos; /* remember it */
  2024. else
  2025. iter->pos = 0; /* forget it */
  2026. return l;
  2027. }
/* seq_file ->start for /proc/net/route: locate the MAIN table and
 * position the iterator.  *pos == 0 yields SEQ_START_TOKEN for the
 * header line.  Takes the RCU read lock, released in ->stop (also on
 * the NULL early return — ->stop is still invoked by seq_file).
 */
static void *fib_route_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(RCU)
{
	struct fib_route_iter *iter = seq->private;
	struct fib_table *tb;

	rcu_read_lock();

	tb = fib_get_table(seq_file_net(seq), RT_TABLE_MAIN);
	if (!tb)
		return NULL;

	iter->main_trie = (struct trie *) tb->tb_data;
	if (*pos == 0)
		return SEQ_START_TOKEN;
	else
		return fib_route_get_idx(iter, *pos - 1);
}
  2043. static void *fib_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
  2044. {
  2045. struct fib_route_iter *iter = seq->private;
  2046. struct leaf *l = v;
  2047. ++*pos;
  2048. if (v == SEQ_START_TOKEN) {
  2049. iter->pos = 0;
  2050. l = trie_firstleaf(iter->main_trie);
  2051. } else {
  2052. iter->pos++;
  2053. l = trie_nextleaf(l);
  2054. }
  2055. if (l)
  2056. iter->key = l->key;
  2057. else
  2058. iter->pos = 0;
  2059. return l;
  2060. }
/* seq_file ->stop: drop the RCU read lock taken in ->start. */
static void fib_route_seq_stop(struct seq_file *seq, void *v)
	__releases(RCU)
{
	rcu_read_unlock();
}
  2066. static unsigned int fib_flag_trans(int type, __be32 mask, const struct fib_info *fi)
  2067. {
  2068. unsigned int flags = 0;
  2069. if (type == RTN_UNREACHABLE || type == RTN_PROHIBIT)
  2070. flags = RTF_REJECT;
  2071. if (fi && fi->fib_nh->nh_gw)
  2072. flags |= RTF_GATEWAY;
  2073. if (mask == htonl(0xFFFFFFFF))
  2074. flags |= RTF_HOST;
  2075. flags |= RTF_UP;
  2076. return flags;
  2077. }
/*
 * This outputs /proc/net/route.
 * The format of the file is not supposed to be changed
 * and needs to be same as fib_hash output to avoid breaking
 * legacy utilities
 */
static int fib_route_seq_show(struct seq_file *seq, void *v)
{
	struct leaf *l = v;
	struct leaf_info *li;
	struct hlist_node *node;

	/* SEQ_START_TOKEN stands for the fixed header line. */
	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
			   "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
			   "\tWindow\tIRTT");
		return 0;
	}

	/* One record per (prefix, alias) of this leaf; broadcast and
	 * multicast entries are suppressed.
	 */
	hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
		struct fib_alias *fa;
		__be32 mask, prefix;

		mask = inet_make_mask(li->plen);
		prefix = htonl(l->key);

		list_for_each_entry_rcu(fa, &li->falh, fa_list) {
			const struct fib_info *fi = fa->fa_info;
			unsigned int flags = fib_flag_trans(fa->fa_type, mask, fi);

			if (fa->fa_type == RTN_BROADCAST
			    || fa->fa_type == RTN_MULTICAST)
				continue;

			/* Legacy format: every record padded to 127
			 * columns before the newline.
			 */
			seq_setwidth(seq, 127);

			if (fi)
				seq_printf(seq,
					 "%s\t%08X\t%08X\t%04X\t%d\t%u\t"
					 "%d\t%08X\t%d\t%u\t%u",
					 fi->fib_dev ? fi->fib_dev->name : "*",
					 prefix,
					 fi->fib_nh->nh_gw, flags, 0, 0,
					 fi->fib_priority,
					 mask,
					 (fi->fib_advmss ?
					  fi->fib_advmss + 40 : 0),
					 fi->fib_window,
					 fi->fib_rtt >> 3);
			else
				seq_printf(seq,
					 "*\t%08X\t%08X\t%04X\t%d\t%u\t"
					 "%d\t%08X\t%d\t%u\t%u",
					 prefix, 0, flags, 0, 0, 0,
					 mask, 0, 0, 0);

			seq_pad(seq, '\n');
		}
	}

	return 0;
}
/* seq_file iterator for /proc/net/route. */
static const struct seq_operations fib_route_seq_ops = {
	.start  = fib_route_seq_start,
	.next   = fib_route_seq_next,
	.stop   = fib_route_seq_stop,
	.show   = fib_route_seq_show,
};
/* open() for /proc/net/route — allocates a fib_route_iter as the
 * per-reader seq_file private state.
 */
static int fib_route_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &fib_route_seq_ops,
			    sizeof(struct fib_route_iter));
}
/* File operations for /proc/net/route. */
static const struct file_operations fib_route_fops = {
	.owner  = THIS_MODULE,
	.open   = fib_route_seq_open,
	.read   = seq_read,
	.llseek = seq_lseek,
	.release = seq_release_net,
};
/* Register the /proc/net entries (fib_trie, fib_triestat, route) for
 * namespace @net.  On failure, already-created entries are unwound via
 * the goto chain and -ENOMEM is returned.
 */
int __net_init fib_proc_init(struct net *net)
{
	if (!proc_net_fops_create(net, "fib_trie", S_IRUGO, &fib_trie_fops))
		goto out1;

	if (!proc_net_fops_create(net, "fib_triestat", S_IRUGO,
				  &fib_triestat_fops))
		goto out2;

	if (!proc_net_fops_create(net, "route", S_IRUGO, &fib_route_fops))
		goto out3;

	return 0;

out3:
	proc_net_remove(net, "fib_triestat");
out2:
	proc_net_remove(net, "fib_trie");
out1:
	return -ENOMEM;
}
/* Remove the /proc/net entries created by fib_proc_init(). */
void __net_exit fib_proc_exit(struct net *net)
{
	proc_net_remove(net, "fib_trie");
	proc_net_remove(net, "fib_triestat");
	proc_net_remove(net, "route");
}
  2172. #endif /* CONFIG_PROC_FS */