l2t.c 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469
  1. /*
  2. * Copyright (c) 2003-2008 Chelsio, Inc. All rights reserved.
  3. *
  4. * This software is available to you under a choice of one of two
  5. * licenses. You may choose to be licensed under the terms of the GNU
  6. * General Public License (GPL) Version 2, available from the file
  7. * COPYING in the main directory of this source tree, or the
  8. * OpenIB.org BSD license below:
  9. *
  10. * Redistribution and use in source and binary forms, with or
  11. * without modification, are permitted provided that the following
  12. * conditions are met:
  13. *
  14. * - Redistributions of source code must retain the above
  15. * copyright notice, this list of conditions and the following
  16. * disclaimer.
  17. *
  18. * - Redistributions in binary form must reproduce the above
  19. * copyright notice, this list of conditions and the following
  20. * disclaimer in the documentation and/or other materials
  21. * provided with the distribution.
  22. *
  23. * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  24. * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  25. * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  26. * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
  27. * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
  28. * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  29. * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  30. * SOFTWARE.
  31. */
  32. #include <linux/skbuff.h>
  33. #include <linux/netdevice.h>
  34. #include <linux/if.h>
  35. #include <linux/if_vlan.h>
  36. #include <linux/jhash.h>
  37. #include <linux/slab.h>
  38. #include <linux/export.h>
  39. #include <net/neighbour.h>
  40. #include "common.h"
  41. #include "t3cdev.h"
  42. #include "cxgb3_defs.h"
  43. #include "l2t.h"
  44. #include "t3_cpl.h"
  45. #include "firmware_exports.h"
  46. #define VLAN_NONE 0xfff
  47. /*
  48. * Module locking notes: There is a RW lock protecting the L2 table as a
  49. * whole plus a spinlock per L2T entry. Entry lookups and allocations happen
  50. * under the protection of the table lock, individual entry changes happen
  51. * while holding that entry's spinlock. The table lock nests outside the
  52. * entry locks. Allocations of new entries take the table lock as writers so
  53. * no other lookups can happen while allocating new entries. Entry updates
  54. * take the table lock as readers so multiple entries can be updated in
  55. * parallel. An L2T entry can be dropped by decrementing its reference count
  56. * and therefore can happen in parallel with entry allocation but no entry
  57. * can change state or increment its ref count during allocation as both of
  58. * these perform lookups.
  59. */
  60. static inline unsigned int vlan_prio(const struct l2t_entry *e)
  61. {
  62. return e->vlan >> 13;
  63. }
  64. static inline unsigned int arp_hash(u32 key, int ifindex,
  65. const struct l2t_data *d)
  66. {
  67. return jhash_2words(key, ifindex, 0) & (d->nentries - 1);
  68. }
  69. static inline void neigh_replace(struct l2t_entry *e, struct neighbour *n)
  70. {
  71. neigh_hold(n);
  72. if (e->neigh)
  73. neigh_release(e->neigh);
  74. e->neigh = n;
  75. }
  76. /*
  77. * Set up an L2T entry and send any packets waiting in the arp queue. The
  78. * supplied skb is used for the CPL_L2T_WRITE_REQ. Must be called with the
  79. * entry locked.
  80. */
  81. static int setup_l2e_send_pending(struct t3cdev *dev, struct sk_buff *skb,
  82. struct l2t_entry *e)
  83. {
  84. struct cpl_l2t_write_req *req;
  85. struct sk_buff *tmp;
  86. if (!skb) {
  87. skb = alloc_skb(sizeof(*req), GFP_ATOMIC);
  88. if (!skb)
  89. return -ENOMEM;
  90. }
  91. req = (struct cpl_l2t_write_req *)__skb_put(skb, sizeof(*req));
  92. req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
  93. OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_L2T_WRITE_REQ, e->idx));
  94. req->params = htonl(V_L2T_W_IDX(e->idx) | V_L2T_W_IFF(e->smt_idx) |
  95. V_L2T_W_VLAN(e->vlan & VLAN_VID_MASK) |
  96. V_L2T_W_PRIO(vlan_prio(e)));
  97. memcpy(e->dmac, e->neigh->ha, sizeof(e->dmac));
  98. memcpy(req->dst_mac, e->dmac, sizeof(req->dst_mac));
  99. skb->priority = CPL_PRIORITY_CONTROL;
  100. cxgb3_ofld_send(dev, skb);
  101. skb_queue_walk_safe(&e->arpq, skb, tmp) {
  102. __skb_unlink(skb, &e->arpq);
  103. cxgb3_ofld_send(dev, skb);
  104. }
  105. e->state = L2T_STATE_VALID;
  106. return 0;
  107. }
  108. /*
  109. * Add a packet to the an L2T entry's queue of packets awaiting resolution.
  110. * Must be called with the entry's lock held.
  111. */
  112. static inline void arpq_enqueue(struct l2t_entry *e, struct sk_buff *skb)
  113. {
  114. __skb_queue_tail(&e->arpq, skb);
  115. }
  116. int t3_l2t_send_slow(struct t3cdev *dev, struct sk_buff *skb,
  117. struct l2t_entry *e)
  118. {
  119. again:
  120. switch (e->state) {
  121. case L2T_STATE_STALE: /* entry is stale, kick off revalidation */
  122. neigh_event_send(e->neigh, NULL);
  123. spin_lock_bh(&e->lock);
  124. if (e->state == L2T_STATE_STALE)
  125. e->state = L2T_STATE_VALID;
  126. spin_unlock_bh(&e->lock);
  127. case L2T_STATE_VALID: /* fast-path, send the packet on */
  128. return cxgb3_ofld_send(dev, skb);
  129. case L2T_STATE_RESOLVING:
  130. spin_lock_bh(&e->lock);
  131. if (e->state != L2T_STATE_RESOLVING) {
  132. /* ARP already completed */
  133. spin_unlock_bh(&e->lock);
  134. goto again;
  135. }
  136. arpq_enqueue(e, skb);
  137. spin_unlock_bh(&e->lock);
  138. /*
  139. * Only the first packet added to the arpq should kick off
  140. * resolution. However, because the alloc_skb below can fail,
  141. * we allow each packet added to the arpq to retry resolution
  142. * as a way of recovering from transient memory exhaustion.
  143. * A better way would be to use a work request to retry L2T
  144. * entries when there's no memory.
  145. */
  146. if (!neigh_event_send(e->neigh, NULL)) {
  147. skb = alloc_skb(sizeof(struct cpl_l2t_write_req),
  148. GFP_ATOMIC);
  149. if (!skb)
  150. break;
  151. spin_lock_bh(&e->lock);
  152. if (!skb_queue_empty(&e->arpq))
  153. setup_l2e_send_pending(dev, skb, e);
  154. else /* we lost the race */
  155. __kfree_skb(skb);
  156. spin_unlock_bh(&e->lock);
  157. }
  158. }
  159. return 0;
  160. }
  161. EXPORT_SYMBOL(t3_l2t_send_slow);
  162. void t3_l2t_send_event(struct t3cdev *dev, struct l2t_entry *e)
  163. {
  164. again:
  165. switch (e->state) {
  166. case L2T_STATE_STALE: /* entry is stale, kick off revalidation */
  167. neigh_event_send(e->neigh, NULL);
  168. spin_lock_bh(&e->lock);
  169. if (e->state == L2T_STATE_STALE) {
  170. e->state = L2T_STATE_VALID;
  171. }
  172. spin_unlock_bh(&e->lock);
  173. return;
  174. case L2T_STATE_VALID: /* fast-path, send the packet on */
  175. return;
  176. case L2T_STATE_RESOLVING:
  177. spin_lock_bh(&e->lock);
  178. if (e->state != L2T_STATE_RESOLVING) {
  179. /* ARP already completed */
  180. spin_unlock_bh(&e->lock);
  181. goto again;
  182. }
  183. spin_unlock_bh(&e->lock);
  184. /*
  185. * Only the first packet added to the arpq should kick off
  186. * resolution. However, because the alloc_skb below can fail,
  187. * we allow each packet added to the arpq to retry resolution
  188. * as a way of recovering from transient memory exhaustion.
  189. * A better way would be to use a work request to retry L2T
  190. * entries when there's no memory.
  191. */
  192. neigh_event_send(e->neigh, NULL);
  193. }
  194. }
  195. EXPORT_SYMBOL(t3_l2t_send_event);
  196. /*
  197. * Allocate a free L2T entry. Must be called with l2t_data.lock held.
  198. */
  199. static struct l2t_entry *alloc_l2e(struct l2t_data *d)
  200. {
  201. struct l2t_entry *end, *e, **p;
  202. if (!atomic_read(&d->nfree))
  203. return NULL;
  204. /* there's definitely a free entry */
  205. for (e = d->rover, end = &d->l2tab[d->nentries]; e != end; ++e)
  206. if (atomic_read(&e->refcnt) == 0)
  207. goto found;
  208. for (e = &d->l2tab[1]; atomic_read(&e->refcnt); ++e) ;
  209. found:
  210. d->rover = e + 1;
  211. atomic_dec(&d->nfree);
  212. /*
  213. * The entry we found may be an inactive entry that is
  214. * presently in the hash table. We need to remove it.
  215. */
  216. if (e->state != L2T_STATE_UNUSED) {
  217. int hash = arp_hash(e->addr, e->ifindex, d);
  218. for (p = &d->l2tab[hash].first; *p; p = &(*p)->next)
  219. if (*p == e) {
  220. *p = e->next;
  221. break;
  222. }
  223. e->state = L2T_STATE_UNUSED;
  224. }
  225. return e;
  226. }
  227. /*
  228. * Called when an L2T entry has no more users. The entry is left in the hash
  229. * table since it is likely to be reused but we also bump nfree to indicate
  230. * that the entry can be reallocated for a different neighbor. We also drop
  231. * the existing neighbor reference in case the neighbor is going away and is
  232. * waiting on our reference.
  233. *
  234. * Because entries can be reallocated to other neighbors once their ref count
  235. * drops to 0 we need to take the entry's lock to avoid races with a new
  236. * incarnation.
  237. */
  238. void t3_l2e_free(struct l2t_data *d, struct l2t_entry *e)
  239. {
  240. spin_lock_bh(&e->lock);
  241. if (atomic_read(&e->refcnt) == 0) { /* hasn't been recycled */
  242. if (e->neigh) {
  243. neigh_release(e->neigh);
  244. e->neigh = NULL;
  245. }
  246. }
  247. spin_unlock_bh(&e->lock);
  248. atomic_inc(&d->nfree);
  249. }
  250. EXPORT_SYMBOL(t3_l2e_free);
  251. /*
  252. * Update an L2T entry that was previously used for the same next hop as neigh.
  253. * Must be called with softirqs disabled.
  254. */
  255. static inline void reuse_entry(struct l2t_entry *e, struct neighbour *neigh)
  256. {
  257. unsigned int nud_state;
  258. spin_lock(&e->lock); /* avoid race with t3_l2t_free */
  259. if (neigh != e->neigh)
  260. neigh_replace(e, neigh);
  261. nud_state = neigh->nud_state;
  262. if (memcmp(e->dmac, neigh->ha, sizeof(e->dmac)) ||
  263. !(nud_state & NUD_VALID))
  264. e->state = L2T_STATE_RESOLVING;
  265. else if (nud_state & NUD_CONNECTED)
  266. e->state = L2T_STATE_VALID;
  267. else
  268. e->state = L2T_STATE_STALE;
  269. spin_unlock(&e->lock);
  270. }
  271. struct l2t_entry *t3_l2t_get(struct t3cdev *cdev, struct dst_entry *dst,
  272. struct net_device *dev)
  273. {
  274. struct l2t_entry *e = NULL;
  275. struct neighbour *neigh;
  276. struct port_info *p;
  277. struct l2t_data *d;
  278. int hash;
  279. u32 addr;
  280. int ifidx;
  281. int smt_idx;
  282. rcu_read_lock();
  283. neigh = dst_get_neighbour_noref(dst);
  284. if (!neigh)
  285. goto done_rcu;
  286. addr = *(u32 *) neigh->primary_key;
  287. ifidx = neigh->dev->ifindex;
  288. if (!dev)
  289. dev = neigh->dev;
  290. p = netdev_priv(dev);
  291. smt_idx = p->port_id;
  292. d = L2DATA(cdev);
  293. if (!d)
  294. goto done_rcu;
  295. hash = arp_hash(addr, ifidx, d);
  296. write_lock_bh(&d->lock);
  297. for (e = d->l2tab[hash].first; e; e = e->next)
  298. if (e->addr == addr && e->ifindex == ifidx &&
  299. e->smt_idx == smt_idx) {
  300. l2t_hold(d, e);
  301. if (atomic_read(&e->refcnt) == 1)
  302. reuse_entry(e, neigh);
  303. goto done_unlock;
  304. }
  305. /* Need to allocate a new entry */
  306. e = alloc_l2e(d);
  307. if (e) {
  308. spin_lock(&e->lock); /* avoid race with t3_l2t_free */
  309. e->next = d->l2tab[hash].first;
  310. d->l2tab[hash].first = e;
  311. e->state = L2T_STATE_RESOLVING;
  312. e->addr = addr;
  313. e->ifindex = ifidx;
  314. e->smt_idx = smt_idx;
  315. atomic_set(&e->refcnt, 1);
  316. neigh_replace(e, neigh);
  317. if (neigh->dev->priv_flags & IFF_802_1Q_VLAN)
  318. e->vlan = vlan_dev_vlan_id(neigh->dev);
  319. else
  320. e->vlan = VLAN_NONE;
  321. spin_unlock(&e->lock);
  322. }
  323. done_unlock:
  324. write_unlock_bh(&d->lock);
  325. done_rcu:
  326. rcu_read_unlock();
  327. return e;
  328. }
  329. EXPORT_SYMBOL(t3_l2t_get);
  330. /*
  331. * Called when address resolution fails for an L2T entry to handle packets
  332. * on the arpq head. If a packet specifies a failure handler it is invoked,
  333. * otherwise the packets is sent to the offload device.
  334. *
  335. * XXX: maybe we should abandon the latter behavior and just require a failure
  336. * handler.
  337. */
  338. static void handle_failed_resolution(struct t3cdev *dev, struct sk_buff_head *arpq)
  339. {
  340. struct sk_buff *skb, *tmp;
  341. skb_queue_walk_safe(arpq, skb, tmp) {
  342. struct l2t_skb_cb *cb = L2T_SKB_CB(skb);
  343. __skb_unlink(skb, arpq);
  344. if (cb->arp_failure_handler)
  345. cb->arp_failure_handler(dev, skb);
  346. else
  347. cxgb3_ofld_send(dev, skb);
  348. }
  349. }
  350. /*
  351. * Called when the host's ARP layer makes a change to some entry that is
  352. * loaded into the HW L2 table.
  353. */
  354. void t3_l2t_update(struct t3cdev *dev, struct neighbour *neigh)
  355. {
  356. struct sk_buff_head arpq;
  357. struct l2t_entry *e;
  358. struct l2t_data *d = L2DATA(dev);
  359. u32 addr = *(u32 *) neigh->primary_key;
  360. int ifidx = neigh->dev->ifindex;
  361. int hash = arp_hash(addr, ifidx, d);
  362. read_lock_bh(&d->lock);
  363. for (e = d->l2tab[hash].first; e; e = e->next)
  364. if (e->addr == addr && e->ifindex == ifidx) {
  365. spin_lock(&e->lock);
  366. goto found;
  367. }
  368. read_unlock_bh(&d->lock);
  369. return;
  370. found:
  371. __skb_queue_head_init(&arpq);
  372. read_unlock(&d->lock);
  373. if (atomic_read(&e->refcnt)) {
  374. if (neigh != e->neigh)
  375. neigh_replace(e, neigh);
  376. if (e->state == L2T_STATE_RESOLVING) {
  377. if (neigh->nud_state & NUD_FAILED) {
  378. skb_queue_splice_init(&e->arpq, &arpq);
  379. } else if (neigh->nud_state & (NUD_CONNECTED|NUD_STALE))
  380. setup_l2e_send_pending(dev, NULL, e);
  381. } else {
  382. e->state = neigh->nud_state & NUD_CONNECTED ?
  383. L2T_STATE_VALID : L2T_STATE_STALE;
  384. if (memcmp(e->dmac, neigh->ha, 6))
  385. setup_l2e_send_pending(dev, NULL, e);
  386. }
  387. }
  388. spin_unlock_bh(&e->lock);
  389. if (!skb_queue_empty(&arpq))
  390. handle_failed_resolution(dev, &arpq);
  391. }
  392. struct l2t_data *t3_init_l2t(unsigned int l2t_capacity)
  393. {
  394. struct l2t_data *d;
  395. int i, size = sizeof(*d) + l2t_capacity * sizeof(struct l2t_entry);
  396. d = cxgb_alloc_mem(size);
  397. if (!d)
  398. return NULL;
  399. d->nentries = l2t_capacity;
  400. d->rover = &d->l2tab[1]; /* entry 0 is not used */
  401. atomic_set(&d->nfree, l2t_capacity - 1);
  402. rwlock_init(&d->lock);
  403. for (i = 0; i < l2t_capacity; ++i) {
  404. d->l2tab[i].idx = i;
  405. d->l2tab[i].state = L2T_STATE_UNUSED;
  406. __skb_queue_head_init(&d->l2tab[i].arpq);
  407. spin_lock_init(&d->l2tab[i].lock);
  408. atomic_set(&d->l2tab[i].refcnt, 0);
  409. }
  410. return d;
  411. }
  412. void t3_free_l2t(struct l2t_data *d)
  413. {
  414. cxgb_free_mem(d);
  415. }