
#include <linux/ceph/ceph_debug.h>

#include <linux/module.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/random.h>
#include <linux/sched.h>

#include <linux/ceph/mon_client.h>
#include <linux/ceph/libceph.h>
#include <linux/ceph/decode.h>

#include <linux/ceph/auth.h>

/*
 * Interact with Ceph monitor cluster.  Handle requests for new map
 * versions, and periodically resend as needed.  Also implement
 * statfs() and umount().
 *
 * A small cluster of Ceph "monitors" are responsible for managing critical
 * cluster configuration and state information.  An odd number (e.g., 3, 5)
 * of cmon daemons use a modified version of the Paxos part-time parliament
 * algorithm to manage the MDS map (mds cluster membership), OSD map, and
 * list of clients who have mounted the file system.
 *
 * We maintain an open, active session with a monitor at all times in order to
 * receive timely MDSMap updates.  We periodically send a keepalive byte on the
 * TCP socket to ensure we detect a failure.  If the connection does break, we
 * randomly hunt for a new monitor.  Once the connection is reestablished, we
 * resend any outstanding requests.
 */

static const struct ceph_connection_operations mon_con_ops;

static int __validate_auth(struct ceph_mon_client *monc);

/*
 * Decode a monmap blob (e.g., during mount).
 */
struct ceph_monmap *ceph_monmap_decode(void *p, void *end)
{
	struct ceph_monmap *m = NULL;
	int i, err = -EINVAL;
	struct ceph_fsid fsid;
	u32 epoch, num_mon;
	u16 version;
	u32 len;

	ceph_decode_32_safe(&p, end, len, bad);
	ceph_decode_need(&p, end, len, bad);

	dout("monmap_decode %p %p len %d\n", p, end, (int)(end-p));

	ceph_decode_16_safe(&p, end, version, bad);

	ceph_decode_need(&p, end, sizeof(fsid) + 2*sizeof(u32), bad);
	ceph_decode_copy(&p, &fsid, sizeof(fsid));
	epoch = ceph_decode_32(&p);

	num_mon = ceph_decode_32(&p);
	ceph_decode_need(&p, end, num_mon*sizeof(m->mon_inst[0]), bad);
	if (num_mon >= CEPH_MAX_MON)
		goto bad;
	m = kmalloc(sizeof(*m) + sizeof(m->mon_inst[0])*num_mon, GFP_NOFS);
	if (m == NULL)
		return ERR_PTR(-ENOMEM);
	m->fsid = fsid;
	m->epoch = epoch;
	m->num_mon = num_mon;
	ceph_decode_copy(&p, m->mon_inst, num_mon*sizeof(m->mon_inst[0]));
	for (i = 0; i < num_mon; i++)
		ceph_decode_addr(&m->mon_inst[i].addr);

	dout("monmap_decode epoch %d, num_mon %d\n", m->epoch,
	     m->num_mon);
	for (i = 0; i < m->num_mon; i++)
		dout("monmap_decode  mon%d is %s\n", i,
		     ceph_pr_addr(&m->mon_inst[i].addr.in_addr));
	return m;

bad:
	dout("monmap_decode failed with %d\n", err);
	kfree(m);
	return ERR_PTR(err);
}

/*
 * return true if *addr is included in the monmap.
 */
int ceph_monmap_contains(struct ceph_monmap *m, struct ceph_entity_addr *addr)
{
	int i;

	for (i = 0; i < m->num_mon; i++)
		if (memcmp(addr, &m->mon_inst[i].addr, sizeof(*addr)) == 0)
			return 1;
	return 0;
}

/*
 * Send an auth request.
 */
static void __send_prepared_auth_request(struct ceph_mon_client *monc, int len)
{
	monc->pending_auth = 1;
	monc->m_auth->front.iov_len = len;
	monc->m_auth->hdr.front_len = cpu_to_le32(len);
	ceph_con_revoke(monc->con, monc->m_auth);
	ceph_msg_get(monc->m_auth);  /* keep our ref */
	ceph_con_send(monc->con, monc->m_auth);
}

/*
 * Close monitor session, if any.
 */
static void __close_session(struct ceph_mon_client *monc)
{
	if (monc->con) {
		dout("__close_session closing mon%d\n", monc->cur_mon);
		ceph_con_revoke(monc->con, monc->m_auth);
		ceph_con_close(monc->con);
		monc->cur_mon = -1;
		monc->pending_auth = 0;
		ceph_auth_reset(monc->auth);
	}
}

/*
 * Open a session with a (new) monitor.
 */
static int __open_session(struct ceph_mon_client *monc)
{
	char r;
	int ret;

	if (monc->cur_mon < 0) {
		get_random_bytes(&r, 1);
		monc->cur_mon = r % monc->monmap->num_mon;
		dout("open_session num=%d r=%d -> mon%d\n",
		     monc->monmap->num_mon, r, monc->cur_mon);
		monc->sub_sent = 0;
		monc->sub_renew_after = jiffies;  /* i.e., expired */
		monc->want_next_osdmap = !!monc->want_next_osdmap;

		dout("open_session mon%d opening\n", monc->cur_mon);
		monc->con->peer_name.type = CEPH_ENTITY_TYPE_MON;
		monc->con->peer_name.num = cpu_to_le64(monc->cur_mon);
		ceph_con_open(monc->con,
			      &monc->monmap->mon_inst[monc->cur_mon].addr);

		/* initiate authentication handshake */
		ret = ceph_auth_build_hello(monc->auth,
					    monc->m_auth->front.iov_base,
					    monc->m_auth->front_max);
		__send_prepared_auth_request(monc, ret);
	} else {
		dout("open_session mon%d already open\n", monc->cur_mon);
	}
	return 0;
}
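
/*
 * Our subscription is considered expired once sub_renew_after has
 * passed; __send_subscribe() uses this to decide when to renew.
 */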
static bool __sub_expired(struct ceph_mon_client *monc)
{
	return time_after_eq(jiffies, monc->sub_renew_after);
}

/*
 * Reschedule delayed work timer.
 */
static void __schedule_delayed(struct ceph_mon_client *monc)
{
	unsigned delay;

	if (monc->cur_mon < 0 || __sub_expired(monc))
		delay = 10 * HZ;
	else
		delay = 20 * HZ;
	dout("__schedule_delayed after %u\n", delay);
	schedule_delayed_work(&monc->delayed_work, delay);
}

/*
 * Send subscribe request for mdsmap and/or osdmap.
 */
static void __send_subscribe(struct ceph_mon_client *monc)
{
	dout("__send_subscribe sub_sent=%u exp=%u want_osd=%d\n",
	     (unsigned)monc->sub_sent, __sub_expired(monc),
	     monc->want_next_osdmap);
	if ((__sub_expired(monc) && !monc->sub_sent) ||
	    monc->want_next_osdmap == 1) {
		struct ceph_msg *msg = monc->m_subscribe;
		struct ceph_mon_subscribe_item *i;
		void *p, *end;
		int num;

		p = msg->front.iov_base;
		end = p + msg->front_max;

		num = 1 + !!monc->want_next_osdmap + !!monc->want_mdsmap;
		ceph_encode_32(&p, num);
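
		/*
		 * Each subscription is encoded as a length-prefixed map
		 * name followed by a ceph_mon_subscribe_item: the map
		 * epoch we already have, and an "onetime" flag saying
		 * whether we want just the next epoch or a standing
		 * subscription.
		 */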
		if (monc->want_next_osdmap) {
			dout("__send_subscribe to 'osdmap' %u\n",
			     (unsigned)monc->have_osdmap);
			ceph_encode_string(&p, end, "osdmap", 6);
			i = p;
			i->have = cpu_to_le64(monc->have_osdmap);
			i->onetime = 1;
			p += sizeof(*i);
			monc->want_next_osdmap = 2;  /* requested */
		}
		if (monc->want_mdsmap) {
			dout("__send_subscribe to 'mdsmap' %u+\n",
			     (unsigned)monc->have_mdsmap);
			ceph_encode_string(&p, end, "mdsmap", 6);
			i = p;
			i->have = cpu_to_le64(monc->have_mdsmap);
			i->onetime = 0;
			p += sizeof(*i);
		}
		ceph_encode_string(&p, end, "monmap", 6);
		i = p;
		i->have = 0;
		i->onetime = 0;
		p += sizeof(*i);

		msg->front.iov_len = p - msg->front.iov_base;
		msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
		ceph_con_revoke(monc->con, msg);
		ceph_con_send(monc->con, ceph_msg_get(msg));

		monc->sub_sent = jiffies | 1;  /* never 0 */
	}
}
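
/*
 * The monitor acks a subscription with the number of seconds it will
 * remain valid; we schedule renewal at half that interval so the
 * subscription never lapses.
 */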
static void handle_subscribe_ack(struct ceph_mon_client *monc,
				 struct ceph_msg *msg)
{
	unsigned seconds;
	struct ceph_mon_subscribe_ack *h = msg->front.iov_base;

	if (msg->front.iov_len < sizeof(*h))
		goto bad;
	seconds = le32_to_cpu(h->duration);

	mutex_lock(&monc->mutex);
	if (monc->hunting) {
		pr_info("mon%d %s session established\n",
			monc->cur_mon,
			ceph_pr_addr(&monc->con->peer_addr.in_addr));
		monc->hunting = false;
	}
	dout("handle_subscribe_ack after %d seconds\n", seconds);
	monc->sub_renew_after = monc->sub_sent + (seconds >> 1)*HZ - 1;
	monc->sub_sent = 0;
	mutex_unlock(&monc->mutex);
	return;
bad:
	pr_err("got corrupt subscribe-ack msg\n");
	ceph_msg_dump(msg);
}

/*
 * Keep track of which maps we have
 */
int ceph_monc_got_mdsmap(struct ceph_mon_client *monc, u32 got)
{
	mutex_lock(&monc->mutex);
	monc->have_mdsmap = got;
	mutex_unlock(&monc->mutex);
	return 0;
}
EXPORT_SYMBOL(ceph_monc_got_mdsmap);

int ceph_monc_got_osdmap(struct ceph_mon_client *monc, u32 got)
{
	mutex_lock(&monc->mutex);
	monc->have_osdmap = got;
	monc->want_next_osdmap = 0;
	mutex_unlock(&monc->mutex);
	return 0;
}

/*
 * Register interest in the next osdmap
 */
void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc)
{
	dout("request_next_osdmap have %u\n", monc->have_osdmap);
	mutex_lock(&monc->mutex);
	if (!monc->want_next_osdmap)
		monc->want_next_osdmap = 1;
	if (monc->want_next_osdmap < 2)
		__send_subscribe(monc);
	mutex_unlock(&monc->mutex);
}

/*
 * Allocate the monitor connection on first use, then open a session
 * with a randomly chosen monitor.
 */
int ceph_monc_open_session(struct ceph_mon_client *monc)
{
	if (!monc->con) {
		monc->con = kmalloc(sizeof(*monc->con), GFP_KERNEL);
		if (!monc->con)
			return -ENOMEM;
		ceph_con_init(monc->client->msgr, monc->con);
		monc->con->private = monc;
		monc->con->ops = &mon_con_ops;
	}

	mutex_lock(&monc->mutex);
	__open_session(monc);
	__schedule_delayed(monc);
	mutex_unlock(&monc->mutex);
	return 0;
}
EXPORT_SYMBOL(ceph_monc_open_session);

/*
 * The monitor responds with a mount ack indicating mount success.  The
 * included client ticket allows the client to talk to MDSs and OSDs.
 */
static void ceph_monc_handle_map(struct ceph_mon_client *monc,
				 struct ceph_msg *msg)
{
	struct ceph_client *client = monc->client;
	struct ceph_monmap *monmap = NULL, *old = monc->monmap;
	void *p, *end;

	mutex_lock(&monc->mutex);

	dout("handle_monmap\n");
	p = msg->front.iov_base;
	end = p + msg->front.iov_len;

	monmap = ceph_monmap_decode(p, end);
	if (IS_ERR(monmap)) {
		pr_err("problem decoding monmap, %d\n",
		       (int)PTR_ERR(monmap));
		goto out;
	}

	if (ceph_check_fsid(monc->client, &monmap->fsid) < 0) {
		kfree(monmap);
		goto out;
	}

	client->monc.monmap = monmap;
	kfree(old);

out:
	mutex_unlock(&monc->mutex);
	wake_up_all(&client->auth_wq);
}

/*
 * generic requests (e.g., statfs, poolop)
 */
static struct ceph_mon_generic_request *__lookup_generic_req(
	struct ceph_mon_client *monc, u64 tid)
{
	struct ceph_mon_generic_request *req;
	struct rb_node *n = monc->generic_request_tree.rb_node;

	while (n) {
		req = rb_entry(n, struct ceph_mon_generic_request, node);
		if (tid < req->tid)
			n = n->rb_left;
		else if (tid > req->tid)
			n = n->rb_right;
		else
			return req;
	}
	return NULL;
}
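
/*
 * Pending requests live in an rbtree keyed by transaction id.  Tids
 * are assigned from monc->last_tid and never reused, so a duplicate
 * insert indicates a bug.
 */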
static void __insert_generic_request(struct ceph_mon_client *monc,
				     struct ceph_mon_generic_request *new)
{
	struct rb_node **p = &monc->generic_request_tree.rb_node;
	struct rb_node *parent = NULL;
	struct ceph_mon_generic_request *req = NULL;

	while (*p) {
		parent = *p;
		req = rb_entry(parent, struct ceph_mon_generic_request, node);
		if (new->tid < req->tid)
			p = &(*p)->rb_left;
		else if (new->tid > req->tid)
			p = &(*p)->rb_right;
		else
			BUG();
	}

	rb_link_node(&new->node, parent, p);
	rb_insert_color(&new->node, &monc->generic_request_tree);
}

static void release_generic_request(struct kref *kref)
{
	struct ceph_mon_generic_request *req =
		container_of(kref, struct ceph_mon_generic_request, kref);

	if (req->reply)
		ceph_msg_put(req->reply);
	if (req->request)
		ceph_msg_put(req->request);

	kfree(req);
}

static void put_generic_request(struct ceph_mon_generic_request *req)
{
	kref_put(&req->kref, release_generic_request);
}

static void get_generic_request(struct ceph_mon_generic_request *req)
{
	kref_get(&req->kref);
}
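
/*
 * alloc_msg callback for generic replies: hand back the preallocated
 * reply message for the matching tid, or set *skip so the messenger
 * drops a reply we are no longer waiting for.
 */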
static struct ceph_msg *get_generic_reply(struct ceph_connection *con,
					  struct ceph_msg_header *hdr,
					  int *skip)
{
	struct ceph_mon_client *monc = con->private;
	struct ceph_mon_generic_request *req;
	u64 tid = le64_to_cpu(hdr->tid);
	struct ceph_msg *m;

	mutex_lock(&monc->mutex);
	req = __lookup_generic_req(monc, tid);
	if (!req) {
		dout("get_generic_reply %lld dne\n", tid);
		*skip = 1;
		m = NULL;
	} else {
		dout("get_generic_reply %lld got %p\n", tid, req->reply);
		m = ceph_msg_get(req->reply);
		/*
		 * we don't need to track the connection reading into
		 * this reply because we only have one open connection
		 * at a time, ever.
		 */
	}
	mutex_unlock(&monc->mutex);
	return m;
}
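
/*
 * Register a request, send it, and block until the reply handler
 * fills in req->result (or the wait is interrupted).
 */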
static int do_generic_request(struct ceph_mon_client *monc,
			      struct ceph_mon_generic_request *req)
{
	int err;

	/* register request */
	mutex_lock(&monc->mutex);
	req->tid = ++monc->last_tid;
	req->request->hdr.tid = cpu_to_le64(req->tid);
	__insert_generic_request(monc, req);
	monc->num_generic_requests++;
	ceph_con_send(monc->con, ceph_msg_get(req->request));
	mutex_unlock(&monc->mutex);

	err = wait_for_completion_interruptible(&req->completion);

	mutex_lock(&monc->mutex);
	rb_erase(&req->node, &monc->generic_request_tree);
	monc->num_generic_requests--;
	mutex_unlock(&monc->mutex);

	if (!err)
		err = req->result;
	return err;
}

/*
 * statfs
 */
static void handle_statfs_reply(struct ceph_mon_client *monc,
				struct ceph_msg *msg)
{
	struct ceph_mon_generic_request *req;
	struct ceph_mon_statfs_reply *reply = msg->front.iov_base;
	u64 tid = le64_to_cpu(msg->hdr.tid);

	if (msg->front.iov_len != sizeof(*reply))
		goto bad;
	dout("handle_statfs_reply %p tid %llu\n", msg, tid);

	mutex_lock(&monc->mutex);
	req = __lookup_generic_req(monc, tid);
	if (req) {
		*(struct ceph_statfs *)req->buf = reply->st;
		req->result = 0;
		get_generic_request(req);
	}
	mutex_unlock(&monc->mutex);
	if (req) {
		complete_all(&req->completion);
		put_generic_request(req);
	}
	return;

bad:
	pr_err("corrupt generic reply, tid %llu\n", tid);
	ceph_msg_dump(msg);
}

/*
 * Do a synchronous statfs().
 */
int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf)
{
	struct ceph_mon_generic_request *req;
	struct ceph_mon_statfs *h;
	int err;

	req = kzalloc(sizeof(*req), GFP_NOFS);
	if (!req)
		return -ENOMEM;

	kref_init(&req->kref);
	req->buf = buf;
	req->buf_len = sizeof(*buf);
	init_completion(&req->completion);

	err = -ENOMEM;
	req->request = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), GFP_NOFS);
	if (!req->request)
		goto out;
	req->reply = ceph_msg_new(CEPH_MSG_STATFS_REPLY, 1024, GFP_NOFS);
	if (!req->reply)
		goto out;

	/* fill out request */
	h = req->request->front.iov_base;
	h->monhdr.have_version = 0;
	h->monhdr.session_mon = cpu_to_le16(-1);
	h->monhdr.session_mon_tid = 0;
	h->fsid = monc->monmap->fsid;

	err = do_generic_request(monc, req);

out:
	kref_put(&req->kref, release_generic_request);
	return err;
}
EXPORT_SYMBOL(ceph_monc_do_statfs);
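
/*
 * A minimal sketch of a caller (e.g., a filesystem's ->statfs method);
 * the surrounding context is illustrative, not part of this file:
 *
 *	struct ceph_statfs st;
 *	int err = ceph_monc_do_statfs(&client->monc, &st);
 *
 *	if (err < 0)
 *		return err;
 *	// st fields (kb, kb_used, kb_avail, num_objects) are
 *	// little-endian on the wire and need le64_to_cpu().
 */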

/*
 * pool ops
 */
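
/*
 * The pool-op reply payload is a length-prefixed buffer: a __le32 byte
 * count followed by the data itself.  Copy it out only if the length
 * matches what the caller expects.
 */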
static int get_poolop_reply_buf(const char *src, size_t src_len,
				char *dst, size_t dst_len)
{
	u32 buf_len;

	if (src_len != sizeof(u32) + dst_len)
		return -EINVAL;
	buf_len = le32_to_cpu(*(__le32 *)src);
	if (buf_len != dst_len)
		return -EINVAL;

	memcpy(dst, src + sizeof(u32), dst_len);
	return 0;
}

static void handle_poolop_reply(struct ceph_mon_client *monc,
				struct ceph_msg *msg)
{
	struct ceph_mon_generic_request *req;
	struct ceph_mon_poolop_reply *reply = msg->front.iov_base;
	u64 tid = le64_to_cpu(msg->hdr.tid);

	if (msg->front.iov_len < sizeof(*reply))
		goto bad;
	dout("handle_poolop_reply %p tid %llu\n", msg, tid);

	mutex_lock(&monc->mutex);
	req = __lookup_generic_req(monc, tid);
	if (req) {
		if (req->buf_len &&
		    get_poolop_reply_buf(msg->front.iov_base + sizeof(*reply),
				     msg->front.iov_len - sizeof(*reply),
				     req->buf, req->buf_len) < 0) {
			mutex_unlock(&monc->mutex);
			goto bad;
		}
		req->result = le32_to_cpu(reply->reply_code);
		get_generic_request(req);
	}
	mutex_unlock(&monc->mutex);
	if (req) {
		complete(&req->completion);
		put_generic_request(req);
	}
	return;

bad:
	pr_err("corrupt generic reply, tid %llu\n", tid);
	ceph_msg_dump(msg);
}

/*
 * Do a synchronous pool op.
 */
int ceph_monc_do_poolop(struct ceph_mon_client *monc, u32 op,
			u32 pool, u64 snapid,
			char *buf, int len)
{
	struct ceph_mon_generic_request *req;
	struct ceph_mon_poolop *h;
	int err;

	req = kzalloc(sizeof(*req), GFP_NOFS);
	if (!req)
		return -ENOMEM;

	kref_init(&req->kref);
	req->buf = buf;
	req->buf_len = len;
	init_completion(&req->completion);

	err = -ENOMEM;
	req->request = ceph_msg_new(CEPH_MSG_POOLOP, sizeof(*h), GFP_NOFS);
	if (!req->request)
		goto out;
	req->reply = ceph_msg_new(CEPH_MSG_POOLOP_REPLY, 1024, GFP_NOFS);
	if (!req->reply)
		goto out;

	/* fill out request */
	req->request->hdr.version = cpu_to_le16(2);
	h = req->request->front.iov_base;
	h->monhdr.have_version = 0;
	h->monhdr.session_mon = cpu_to_le16(-1);
	h->monhdr.session_mon_tid = 0;
	h->fsid = monc->monmap->fsid;
	h->pool = cpu_to_le32(pool);
	h->op = cpu_to_le32(op);
	h->auid = 0;
	h->snapid = cpu_to_le64(snapid);
	h->name_len = 0;

	err = do_generic_request(monc, req);

out:
	kref_put(&req->kref, release_generic_request);
	return err;
}

int ceph_monc_create_snapid(struct ceph_mon_client *monc,
			    u32 pool, u64 *snapid)
{
	return ceph_monc_do_poolop(monc, POOL_OP_CREATE_UNMANAGED_SNAP,
				   pool, 0, (char *)snapid, sizeof(*snapid));
}
EXPORT_SYMBOL(ceph_monc_create_snapid);

int ceph_monc_delete_snapid(struct ceph_mon_client *monc,
			    u32 pool, u64 snapid)
{
	return ceph_monc_do_poolop(monc, POOL_OP_DELETE_UNMANAGED_SNAP,
				   pool, snapid, NULL, 0);
}

/*
 * Resend pending generic requests.
 */
static void __resend_generic_request(struct ceph_mon_client *monc)
{
	struct ceph_mon_generic_request *req;
	struct rb_node *p;

	for (p = rb_first(&monc->generic_request_tree); p; p = rb_next(p)) {
		req = rb_entry(p, struct ceph_mon_generic_request, node);
		ceph_con_revoke(monc->con, req->request);
		ceph_con_send(monc->con, ceph_msg_get(req->request));
	}
}

/*
 * Delayed work.  If we haven't mounted yet, retry.  Otherwise,
 * renew/retry subscription as needed (in case it is timing out, or we
 * got an ENOMEM).  And keep the monitor connection alive.
 */
static void delayed_work(struct work_struct *work)
{
	struct ceph_mon_client *monc =
		container_of(work, struct ceph_mon_client, delayed_work.work);

	dout("monc delayed_work\n");
	mutex_lock(&monc->mutex);
	if (monc->hunting) {
		__close_session(monc);
		__open_session(monc);  /* continue hunting */
	} else {
		ceph_con_keepalive(monc->con);

		__validate_auth(monc);

		if (monc->auth->ops->is_authenticated(monc->auth))
			__send_subscribe(monc);
	}
	__schedule_delayed(monc);
	mutex_unlock(&monc->mutex);
}

/*
 * On startup, we build a temporary monmap populated with the IPs
 * provided by mount(2).
 */
static int build_initial_monmap(struct ceph_mon_client *monc)
{
	struct ceph_options *opt = monc->client->options;
	struct ceph_entity_addr *mon_addr = opt->mon_addr;
	int num_mon = opt->num_mon;
	int i;

	/* build initial monmap */
	monc->monmap = kzalloc(sizeof(*monc->monmap) +
			       num_mon*sizeof(monc->monmap->mon_inst[0]),
			       GFP_KERNEL);
	if (!monc->monmap)
		return -ENOMEM;
	for (i = 0; i < num_mon; i++) {
		monc->monmap->mon_inst[i].addr = mon_addr[i];
		monc->monmap->mon_inst[i].addr.nonce = 0;
		monc->monmap->mon_inst[i].name.type =
			CEPH_ENTITY_TYPE_MON;
		monc->monmap->mon_inst[i].name.num = cpu_to_le64(i);
	}
	monc->monmap->num_mon = num_mon;
	monc->have_fsid = false;
	return 0;
}
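
/*
 * Initialize the mon_client: build the initial monmap, set up the
 * authenticator, and preallocate the messages exchanged with the
 * monitor so replies can be received without further allocation.
 */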
int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
{
	int err = 0;

	dout("init\n");
	memset(monc, 0, sizeof(*monc));
	monc->client = cl;
	monc->monmap = NULL;
	mutex_init(&monc->mutex);

	err = build_initial_monmap(monc);
	if (err)
		goto out;

	monc->con = NULL;

	/* authentication */
	monc->auth = ceph_auth_init(cl->options->name,
				    cl->options->key);
	if (IS_ERR(monc->auth)) {
		err = PTR_ERR(monc->auth);
		goto out_monmap;
	}
	monc->auth->want_keys =
		CEPH_ENTITY_TYPE_AUTH | CEPH_ENTITY_TYPE_MON |
		CEPH_ENTITY_TYPE_OSD | CEPH_ENTITY_TYPE_MDS;

	/* msgs */
	err = -ENOMEM;
	monc->m_subscribe_ack = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE_ACK,
				     sizeof(struct ceph_mon_subscribe_ack),
				     GFP_NOFS);
	if (!monc->m_subscribe_ack)
		goto out_auth;

	monc->m_subscribe = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 96, GFP_NOFS);
	if (!monc->m_subscribe)
		goto out_subscribe_ack;

	monc->m_auth_reply = ceph_msg_new(CEPH_MSG_AUTH_REPLY, 4096, GFP_NOFS);
	if (!monc->m_auth_reply)
		goto out_subscribe;

	monc->m_auth = ceph_msg_new(CEPH_MSG_AUTH, 4096, GFP_NOFS);
	monc->pending_auth = 0;
	if (!monc->m_auth)
		goto out_auth_reply;

	monc->cur_mon = -1;
	monc->hunting = true;
	monc->sub_renew_after = jiffies;
	monc->sub_sent = 0;

	INIT_DELAYED_WORK(&monc->delayed_work, delayed_work);
	monc->generic_request_tree = RB_ROOT;
	monc->num_generic_requests = 0;
	monc->last_tid = 0;

	monc->have_mdsmap = 0;
	monc->have_osdmap = 0;
	monc->want_next_osdmap = 1;
	return 0;

out_auth_reply:
	ceph_msg_put(monc->m_auth_reply);
out_subscribe:
	ceph_msg_put(monc->m_subscribe);
out_subscribe_ack:
	ceph_msg_put(monc->m_subscribe_ack);
out_auth:
	ceph_auth_destroy(monc->auth);
out_monmap:
	kfree(monc->monmap);
out:
	return err;
}
EXPORT_SYMBOL(ceph_monc_init);

void ceph_monc_stop(struct ceph_mon_client *monc)
{
	dout("stop\n");
	cancel_delayed_work_sync(&monc->delayed_work);

	mutex_lock(&monc->mutex);
	__close_session(monc);
	if (monc->con) {
		monc->con->private = NULL;
		monc->con->ops->put(monc->con);
		monc->con = NULL;
	}
	mutex_unlock(&monc->mutex);

	ceph_auth_destroy(monc->auth);

	ceph_msg_put(monc->m_auth);
	ceph_msg_put(monc->m_auth_reply);
	ceph_msg_put(monc->m_subscribe);
	ceph_msg_put(monc->m_subscribe_ack);

	kfree(monc->monmap);
}
EXPORT_SYMBOL(ceph_monc_stop);
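
/*
 * Handle one step of the auth handshake: a negative return from
 * ceph_handle_auth_reply() is fatal, a positive one means another
 * request is ready to send, and zero with is_authenticated() true
 * means the handshake is complete and the session can start.
 */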
static void handle_auth_reply(struct ceph_mon_client *monc,
			      struct ceph_msg *msg)
{
	int ret;
	int was_auth = 0;

	mutex_lock(&monc->mutex);
	if (monc->auth->ops)
		was_auth = monc->auth->ops->is_authenticated(monc->auth);
	monc->pending_auth = 0;
	ret = ceph_handle_auth_reply(monc->auth, msg->front.iov_base,
				     msg->front.iov_len,
				     monc->m_auth->front.iov_base,
				     monc->m_auth->front_max);
	if (ret < 0) {
		monc->client->auth_err = ret;
		wake_up_all(&monc->client->auth_wq);
	} else if (ret > 0) {
		__send_prepared_auth_request(monc, ret);
	} else if (!was_auth && monc->auth->ops->is_authenticated(monc->auth)) {
		dout("authenticated, starting session\n");

		monc->client->msgr->inst.name.type = CEPH_ENTITY_TYPE_CLIENT;
		monc->client->msgr->inst.name.num =
					cpu_to_le64(monc->auth->global_id);

		__send_subscribe(monc);
		__resend_generic_request(monc);
	}
	mutex_unlock(&monc->mutex);
}
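
/*
 * Rebuild and resend our auth request if the tickets have expired; a
 * non-positive return from ceph_build_auth() means there is either an
 * error or nothing to do.
 */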
static int __validate_auth(struct ceph_mon_client *monc)
{
	int ret;

	if (monc->pending_auth)
		return 0;

	ret = ceph_build_auth(monc->auth, monc->m_auth->front.iov_base,
			      monc->m_auth->front_max);
	if (ret <= 0)
		return ret;  /* either an error, or no need to authenticate */
	__send_prepared_auth_request(monc, ret);
	return 0;
}

int ceph_monc_validate_auth(struct ceph_mon_client *monc)
{
	int ret;

	mutex_lock(&monc->mutex);
	ret = __validate_auth(monc);
	mutex_unlock(&monc->mutex);

	return ret;
}
EXPORT_SYMBOL(ceph_monc_validate_auth);

/*
 * handle incoming message
 */
static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
{
	struct ceph_mon_client *monc = con->private;
	int type = le16_to_cpu(msg->hdr.type);

	if (!monc)
		return;

	switch (type) {
	case CEPH_MSG_AUTH_REPLY:
		handle_auth_reply(monc, msg);
		break;

	case CEPH_MSG_MON_SUBSCRIBE_ACK:
		handle_subscribe_ack(monc, msg);
		break;

	case CEPH_MSG_STATFS_REPLY:
		handle_statfs_reply(monc, msg);
		break;

	case CEPH_MSG_POOLOP_REPLY:
		handle_poolop_reply(monc, msg);
		break;

	case CEPH_MSG_MON_MAP:
		ceph_monc_handle_map(monc, msg);
		break;

	case CEPH_MSG_OSD_MAP:
		ceph_osdc_handle_map(&monc->client->osdc, msg);
		break;

	default:
		/* can the chained handler handle it? */
		if (monc->client->extra_mon_dispatch &&
		    monc->client->extra_mon_dispatch(monc->client, msg) == 0)
			break;

		pr_err("received unknown message type %d %s\n", type,
		       ceph_msg_type_name(type));
	}
	ceph_msg_put(msg);
}

/*
 * Allocate memory for incoming message
 */
static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con,
				      struct ceph_msg_header *hdr,
				      int *skip)
{
	struct ceph_mon_client *monc = con->private;
	int type = le16_to_cpu(hdr->type);
	int front_len = le32_to_cpu(hdr->front_len);
	struct ceph_msg *m = NULL;

	*skip = 0;

	switch (type) {
	case CEPH_MSG_MON_SUBSCRIBE_ACK:
		m = ceph_msg_get(monc->m_subscribe_ack);
		break;
	case CEPH_MSG_POOLOP_REPLY:
	case CEPH_MSG_STATFS_REPLY:
		return get_generic_reply(con, hdr, skip);
	case CEPH_MSG_AUTH_REPLY:
		m = ceph_msg_get(monc->m_auth_reply);
		break;
	case CEPH_MSG_MON_MAP:
	case CEPH_MSG_MDS_MAP:
	case CEPH_MSG_OSD_MAP:
		m = ceph_msg_new(type, front_len, GFP_NOFS);
		break;
	}

	if (!m) {
		pr_info("alloc_msg unknown type %d\n", type);
		*skip = 1;
	}
	return m;
}

/*
 * If the monitor connection resets, pick a new monitor and resubmit
 * any pending requests.
 */
static void mon_fault(struct ceph_connection *con)
{
	struct ceph_mon_client *monc = con->private;

	if (!monc)
		return;

	dout("mon_fault\n");
	mutex_lock(&monc->mutex);
	if (!con->private)
		goto out;

	if (monc->con && !monc->hunting)
		pr_info("mon%d %s session lost, "
			"hunting for new mon\n", monc->cur_mon,
			ceph_pr_addr(&monc->con->peer_addr.in_addr));

	__close_session(monc);
	if (!monc->hunting) {
		/* start hunting */
		monc->hunting = true;
		__open_session(monc);
	} else {
		/* already hunting, let's wait a bit */
		__schedule_delayed(monc);
	}
out:
	mutex_unlock(&monc->mutex);
}

static const struct ceph_connection_operations mon_con_ops = {
	.get = ceph_con_get,
	.put = ceph_con_put,
	.dispatch = dispatch,
	.fault = mon_fault,
	.alloc_msg = mon_alloc_msg,
};