rcom.c 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577
  1. /******************************************************************************
  2. *******************************************************************************
  3. **
  4. ** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
  5. ** Copyright (C) 2005-2008 Red Hat, Inc. All rights reserved.
  6. **
  7. ** This copyrighted material is made available to anyone wishing to use,
  8. ** modify, copy, or redistribute it subject to the terms and conditions
  9. ** of the GNU General Public License v.2.
  10. **
  11. *******************************************************************************
  12. ******************************************************************************/
  13. #include "dlm_internal.h"
  14. #include "lockspace.h"
  15. #include "member.h"
  16. #include "lowcomms.h"
  17. #include "midcomms.h"
  18. #include "rcom.h"
  19. #include "recover.h"
  20. #include "dir.h"
  21. #include "config.h"
  22. #include "memory.h"
  23. #include "lock.h"
  24. #include "util.h"
  25. #include "member.h"
  26. static int rcom_response(struct dlm_ls *ls)
  27. {
  28. return test_bit(LSFL_RCOM_READY, &ls->ls_flags);
  29. }
  30. static int create_rcom(struct dlm_ls *ls, int to_nodeid, int type, int len,
  31. struct dlm_rcom **rc_ret, struct dlm_mhandle **mh_ret)
  32. {
  33. struct dlm_rcom *rc;
  34. struct dlm_mhandle *mh;
  35. char *mb;
  36. int mb_len = sizeof(struct dlm_rcom) + len;
  37. mh = dlm_lowcomms_get_buffer(to_nodeid, mb_len, GFP_NOFS, &mb);
  38. if (!mh) {
  39. log_print("create_rcom to %d type %d len %d ENOBUFS",
  40. to_nodeid, type, len);
  41. return -ENOBUFS;
  42. }
  43. memset(mb, 0, mb_len);
  44. rc = (struct dlm_rcom *) mb;
  45. rc->rc_header.h_version = (DLM_HEADER_MAJOR | DLM_HEADER_MINOR);
  46. rc->rc_header.h_lockspace = ls->ls_global_id;
  47. rc->rc_header.h_nodeid = dlm_our_nodeid();
  48. rc->rc_header.h_length = mb_len;
  49. rc->rc_header.h_cmd = DLM_RCOM;
  50. rc->rc_type = type;
  51. spin_lock(&ls->ls_recover_lock);
  52. rc->rc_seq = ls->ls_recover_seq;
  53. spin_unlock(&ls->ls_recover_lock);
  54. *mh_ret = mh;
  55. *rc_ret = rc;
  56. return 0;
  57. }
  /* Hand a message built by create_rcom() to the transport: run
     dlm_rcom_out() on it, then commit the lowcomms buffer identified by
     mh.  The ls argument is not used in this function. */
  static void send_rcom(struct dlm_ls *ls, struct dlm_mhandle *mh,
                        struct dlm_rcom *rc)
  {
      dlm_rcom_out(rc);

      dlm_lowcomms_commit_buffer(mh);
  }
  64. static void set_rcom_status(struct dlm_ls *ls, struct rcom_status *rs,
  65. uint32_t flags)
  66. {
  67. rs->rs_flags = cpu_to_le32(flags);
  68. }
  69. /* When replying to a status request, a node also sends back its
  70. configuration values. The requesting node then checks that the remote
  71. node is configured the same way as itself. */
  72. static void set_rcom_config(struct dlm_ls *ls, struct rcom_config *rf,
  73. uint32_t num_slots)
  74. {
  75. rf->rf_lvblen = cpu_to_le32(ls->ls_lvblen);
  76. rf->rf_lsflags = cpu_to_le32(ls->ls_exflags);
  77. rf->rf_our_slot = cpu_to_le16(ls->ls_slot);
  78. rf->rf_num_slots = cpu_to_le16(num_slots);
  79. rf->rf_generation = cpu_to_le32(ls->ls_generation);
  80. }
  81. static int check_rcom_config(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid)
  82. {
  83. struct rcom_config *rf = (struct rcom_config *) rc->rc_buf;
  84. if ((rc->rc_header.h_version & 0xFFFF0000) != DLM_HEADER_MAJOR) {
  85. log_error(ls, "version mismatch: %x nodeid %d: %x",
  86. DLM_HEADER_MAJOR | DLM_HEADER_MINOR, nodeid,
  87. rc->rc_header.h_version);
  88. return -EPROTO;
  89. }
  90. if (le32_to_cpu(rf->rf_lvblen) != ls->ls_lvblen ||
  91. le32_to_cpu(rf->rf_lsflags) != ls->ls_exflags) {
  92. log_error(ls, "config mismatch: %d,%x nodeid %d: %d,%x",
  93. ls->ls_lvblen, ls->ls_exflags, nodeid,
  94. le32_to_cpu(rf->rf_lvblen),
  95. le32_to_cpu(rf->rf_lsflags));
  96. return -EPROTO;
  97. }
  98. return 0;
  99. }
  100. static void allow_sync_reply(struct dlm_ls *ls, uint64_t *new_seq)
  101. {
  102. spin_lock(&ls->ls_rcom_spin);
  103. *new_seq = ++ls->ls_rcom_seq;
  104. set_bit(LSFL_RCOM_WAIT, &ls->ls_flags);
  105. spin_unlock(&ls->ls_rcom_spin);
  106. }
  107. static void disallow_sync_reply(struct dlm_ls *ls)
  108. {
  109. spin_lock(&ls->ls_rcom_spin);
  110. clear_bit(LSFL_RCOM_WAIT, &ls->ls_flags);
  111. clear_bit(LSFL_RCOM_READY, &ls->ls_flags);
  112. spin_unlock(&ls->ls_rcom_spin);
  113. }
  114. /*
  115. * low nodeid gathers one slot value at a time from each node.
  116. * it sets need_slots=0, and saves rf_our_slot returned from each
  117. * rcom_config.
  118. *
  119. * other nodes gather all slot values at once from the low nodeid.
  120. * they set need_slots=1, and ignore the rf_our_slot returned from each
  121. * rcom_config. they use the rf_num_slots returned from the low
  122. * node's rcom_config.
  123. */
  124. int dlm_rcom_status(struct dlm_ls *ls, int nodeid, uint32_t status_flags)
  125. {
  126. struct dlm_rcom *rc;
  127. struct dlm_mhandle *mh;
  128. int error = 0;
  129. ls->ls_recover_nodeid = nodeid;
  130. if (nodeid == dlm_our_nodeid()) {
  131. rc = ls->ls_recover_buf;
  132. rc->rc_result = dlm_recover_status(ls);
  133. goto out;
  134. }
  135. error = create_rcom(ls, nodeid, DLM_RCOM_STATUS,
  136. sizeof(struct rcom_status), &rc, &mh);
  137. if (error)
  138. goto out;
  139. set_rcom_status(ls, (struct rcom_status *)rc->rc_buf, status_flags);
  140. allow_sync_reply(ls, &rc->rc_id);
  141. memset(ls->ls_recover_buf, 0, dlm_config.ci_buffer_size);
  142. send_rcom(ls, mh, rc);
  143. error = dlm_wait_function(ls, &rcom_response);
  144. disallow_sync_reply(ls);
  145. if (error)
  146. goto out;
  147. rc = ls->ls_recover_buf;
  148. if (rc->rc_result == -ESRCH) {
  149. /* we pretend the remote lockspace exists with 0 status */
  150. log_debug(ls, "remote node %d not ready", nodeid);
  151. rc->rc_result = 0;
  152. error = 0;
  153. } else {
  154. error = check_rcom_config(ls, rc, nodeid);
  155. }
  156. /* the caller looks at rc_result for the remote recovery status */
  157. out:
  158. return error;
  159. }
  160. static void receive_rcom_status(struct dlm_ls *ls, struct dlm_rcom *rc_in)
  161. {
  162. struct dlm_rcom *rc;
  163. struct dlm_mhandle *mh;
  164. struct rcom_status *rs;
  165. uint32_t status;
  166. int nodeid = rc_in->rc_header.h_nodeid;
  167. int len = sizeof(struct rcom_config);
  168. int num_slots = 0;
  169. int error;
  170. if (!dlm_slots_version(&rc_in->rc_header)) {
  171. status = dlm_recover_status(ls);
  172. goto do_create;
  173. }
  174. rs = (struct rcom_status *)rc_in->rc_buf;
  175. if (!(rs->rs_flags & DLM_RSF_NEED_SLOTS)) {
  176. status = dlm_recover_status(ls);
  177. goto do_create;
  178. }
  179. spin_lock(&ls->ls_recover_lock);
  180. status = ls->ls_recover_status;
  181. num_slots = ls->ls_num_slots;
  182. spin_unlock(&ls->ls_recover_lock);
  183. len += num_slots * sizeof(struct rcom_slot);
  184. do_create:
  185. error = create_rcom(ls, nodeid, DLM_RCOM_STATUS_REPLY,
  186. len, &rc, &mh);
  187. if (error)
  188. return;
  189. rc->rc_id = rc_in->rc_id;
  190. rc->rc_seq_reply = rc_in->rc_seq;
  191. rc->rc_result = status;
  192. set_rcom_config(ls, (struct rcom_config *)rc->rc_buf, num_slots);
  193. if (!num_slots)
  194. goto do_send;
  195. spin_lock(&ls->ls_recover_lock);
  196. if (ls->ls_num_slots != num_slots) {
  197. spin_unlock(&ls->ls_recover_lock);
  198. log_debug(ls, "receive_rcom_status num_slots %d to %d",
  199. num_slots, ls->ls_num_slots);
  200. rc->rc_result = 0;
  201. set_rcom_config(ls, (struct rcom_config *)rc->rc_buf, 0);
  202. goto do_send;
  203. }
  204. dlm_slots_copy_out(ls, rc);
  205. spin_unlock(&ls->ls_recover_lock);
  206. do_send:
  207. send_rcom(ls, mh, rc);
  208. }
  209. static void receive_sync_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
  210. {
  211. spin_lock(&ls->ls_rcom_spin);
  212. if (!test_bit(LSFL_RCOM_WAIT, &ls->ls_flags) ||
  213. rc_in->rc_id != ls->ls_rcom_seq) {
  214. log_debug(ls, "reject reply %d from %d seq %llx expect %llx",
  215. rc_in->rc_type, rc_in->rc_header.h_nodeid,
  216. (unsigned long long)rc_in->rc_id,
  217. (unsigned long long)ls->ls_rcom_seq);
  218. goto out;
  219. }
  220. memcpy(ls->ls_recover_buf, rc_in, rc_in->rc_header.h_length);
  221. set_bit(LSFL_RCOM_READY, &ls->ls_flags);
  222. clear_bit(LSFL_RCOM_WAIT, &ls->ls_flags);
  223. wake_up(&ls->ls_wait_general);
  224. out:
  225. spin_unlock(&ls->ls_rcom_spin);
  226. }
  227. int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name, int last_len)
  228. {
  229. struct dlm_rcom *rc;
  230. struct dlm_mhandle *mh;
  231. int error = 0;
  232. int max_size = dlm_config.ci_buffer_size - sizeof(struct dlm_rcom);
  233. ls->ls_recover_nodeid = nodeid;
  234. if (nodeid == dlm_our_nodeid()) {
  235. ls->ls_recover_buf->rc_header.h_length =
  236. dlm_config.ci_buffer_size;
  237. dlm_copy_master_names(ls, last_name, last_len,
  238. ls->ls_recover_buf->rc_buf,
  239. max_size, nodeid);
  240. goto out;
  241. }
  242. error = create_rcom(ls, nodeid, DLM_RCOM_NAMES, last_len, &rc, &mh);
  243. if (error)
  244. goto out;
  245. memcpy(rc->rc_buf, last_name, last_len);
  246. allow_sync_reply(ls, &rc->rc_id);
  247. memset(ls->ls_recover_buf, 0, dlm_config.ci_buffer_size);
  248. send_rcom(ls, mh, rc);
  249. error = dlm_wait_function(ls, &rcom_response);
  250. disallow_sync_reply(ls);
  251. out:
  252. return error;
  253. }
  254. static void receive_rcom_names(struct dlm_ls *ls, struct dlm_rcom *rc_in)
  255. {
  256. struct dlm_rcom *rc;
  257. struct dlm_mhandle *mh;
  258. int error, inlen, outlen, nodeid;
  259. nodeid = rc_in->rc_header.h_nodeid;
  260. inlen = rc_in->rc_header.h_length - sizeof(struct dlm_rcom);
  261. outlen = dlm_config.ci_buffer_size - sizeof(struct dlm_rcom);
  262. error = create_rcom(ls, nodeid, DLM_RCOM_NAMES_REPLY, outlen, &rc, &mh);
  263. if (error)
  264. return;
  265. rc->rc_id = rc_in->rc_id;
  266. rc->rc_seq_reply = rc_in->rc_seq;
  267. dlm_copy_master_names(ls, rc_in->rc_buf, inlen, rc->rc_buf, outlen,
  268. nodeid);
  269. send_rcom(ls, mh, rc);
  270. }
  271. int dlm_send_rcom_lookup(struct dlm_rsb *r, int dir_nodeid)
  272. {
  273. struct dlm_rcom *rc;
  274. struct dlm_mhandle *mh;
  275. struct dlm_ls *ls = r->res_ls;
  276. int error;
  277. error = create_rcom(ls, dir_nodeid, DLM_RCOM_LOOKUP, r->res_length,
  278. &rc, &mh);
  279. if (error)
  280. goto out;
  281. memcpy(rc->rc_buf, r->res_name, r->res_length);
  282. rc->rc_id = (unsigned long) r;
  283. send_rcom(ls, mh, rc);
  284. out:
  285. return error;
  286. }
  287. static void receive_rcom_lookup(struct dlm_ls *ls, struct dlm_rcom *rc_in)
  288. {
  289. struct dlm_rcom *rc;
  290. struct dlm_mhandle *mh;
  291. int error, ret_nodeid, nodeid = rc_in->rc_header.h_nodeid;
  292. int len = rc_in->rc_header.h_length - sizeof(struct dlm_rcom);
  293. error = create_rcom(ls, nodeid, DLM_RCOM_LOOKUP_REPLY, 0, &rc, &mh);
  294. if (error)
  295. return;
  296. error = dlm_dir_lookup(ls, nodeid, rc_in->rc_buf, len, &ret_nodeid);
  297. if (error)
  298. ret_nodeid = error;
  299. rc->rc_result = ret_nodeid;
  300. rc->rc_id = rc_in->rc_id;
  301. rc->rc_seq_reply = rc_in->rc_seq;
  302. send_rcom(ls, mh, rc);
  303. }
  /* A DLM_RCOM_LOOKUP_REPLY is passed straight on to
     dlm_recover_master_reply(); no processing happens here. */
  static void receive_rcom_lookup_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
  {
      struct dlm_rcom *reply = rc_in;

      dlm_recover_master_reply(ls, reply);
  }
  308. static void pack_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb,
  309. struct rcom_lock *rl)
  310. {
  311. memset(rl, 0, sizeof(*rl));
  312. rl->rl_ownpid = cpu_to_le32(lkb->lkb_ownpid);
  313. rl->rl_lkid = cpu_to_le32(lkb->lkb_id);
  314. rl->rl_exflags = cpu_to_le32(lkb->lkb_exflags);
  315. rl->rl_flags = cpu_to_le32(lkb->lkb_flags);
  316. rl->rl_lvbseq = cpu_to_le32(lkb->lkb_lvbseq);
  317. rl->rl_rqmode = lkb->lkb_rqmode;
  318. rl->rl_grmode = lkb->lkb_grmode;
  319. rl->rl_status = lkb->lkb_status;
  320. rl->rl_wait_type = cpu_to_le16(lkb->lkb_wait_type);
  321. if (lkb->lkb_bastfn)
  322. rl->rl_asts |= DLM_CB_BAST;
  323. if (lkb->lkb_astfn)
  324. rl->rl_asts |= DLM_CB_CAST;
  325. rl->rl_namelen = cpu_to_le16(r->res_length);
  326. memcpy(rl->rl_name, r->res_name, r->res_length);
  327. /* FIXME: might we have an lvb without DLM_LKF_VALBLK set ?
  328. If so, receive_rcom_lock_args() won't take this copy. */
  329. if (lkb->lkb_lvbptr)
  330. memcpy(rl->rl_lvb, lkb->lkb_lvbptr, r->res_ls->ls_lvblen);
  331. }
  332. int dlm_send_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
  333. {
  334. struct dlm_ls *ls = r->res_ls;
  335. struct dlm_rcom *rc;
  336. struct dlm_mhandle *mh;
  337. struct rcom_lock *rl;
  338. int error, len = sizeof(struct rcom_lock);
  339. if (lkb->lkb_lvbptr)
  340. len += ls->ls_lvblen;
  341. error = create_rcom(ls, r->res_nodeid, DLM_RCOM_LOCK, len, &rc, &mh);
  342. if (error)
  343. goto out;
  344. rl = (struct rcom_lock *) rc->rc_buf;
  345. pack_rcom_lock(r, lkb, rl);
  346. rc->rc_id = (unsigned long) r;
  347. send_rcom(ls, mh, rc);
  348. out:
  349. return error;
  350. }
  351. /* needs at least dlm_rcom + rcom_lock */
  352. static void receive_rcom_lock(struct dlm_ls *ls, struct dlm_rcom *rc_in)
  353. {
  354. struct dlm_rcom *rc;
  355. struct dlm_mhandle *mh;
  356. int error, nodeid = rc_in->rc_header.h_nodeid;
  357. dlm_recover_master_copy(ls, rc_in);
  358. error = create_rcom(ls, nodeid, DLM_RCOM_LOCK_REPLY,
  359. sizeof(struct rcom_lock), &rc, &mh);
  360. if (error)
  361. return;
  362. /* We send back the same rcom_lock struct we received, but
  363. dlm_recover_master_copy() has filled in rl_remid and rl_result */
  364. memcpy(rc->rc_buf, rc_in->rc_buf, sizeof(struct rcom_lock));
  365. rc->rc_id = rc_in->rc_id;
  366. rc->rc_seq_reply = rc_in->rc_seq;
  367. send_rcom(ls, mh, rc);
  368. }
  369. /* If the lockspace doesn't exist then still send a status message
  370. back; it's possible that it just doesn't have its global_id yet. */
  371. int dlm_send_ls_not_ready(int nodeid, struct dlm_rcom *rc_in)
  372. {
  373. struct dlm_rcom *rc;
  374. struct rcom_config *rf;
  375. struct dlm_mhandle *mh;
  376. char *mb;
  377. int mb_len = sizeof(struct dlm_rcom) + sizeof(struct rcom_config);
  378. mh = dlm_lowcomms_get_buffer(nodeid, mb_len, GFP_NOFS, &mb);
  379. if (!mh)
  380. return -ENOBUFS;
  381. memset(mb, 0, mb_len);
  382. rc = (struct dlm_rcom *) mb;
  383. rc->rc_header.h_version = (DLM_HEADER_MAJOR | DLM_HEADER_MINOR);
  384. rc->rc_header.h_lockspace = rc_in->rc_header.h_lockspace;
  385. rc->rc_header.h_nodeid = dlm_our_nodeid();
  386. rc->rc_header.h_length = mb_len;
  387. rc->rc_header.h_cmd = DLM_RCOM;
  388. rc->rc_type = DLM_RCOM_STATUS_REPLY;
  389. rc->rc_id = rc_in->rc_id;
  390. rc->rc_seq_reply = rc_in->rc_seq;
  391. rc->rc_result = -ESRCH;
  392. rf = (struct rcom_config *) rc->rc_buf;
  393. rf->rf_lvblen = cpu_to_le32(~0U);
  394. dlm_rcom_out(rc);
  395. dlm_lowcomms_commit_buffer(mh);
  396. return 0;
  397. }
  398. static int is_old_reply(struct dlm_ls *ls, struct dlm_rcom *rc)
  399. {
  400. uint64_t seq;
  401. int rv = 0;
  402. switch (rc->rc_type) {
  403. case DLM_RCOM_STATUS_REPLY:
  404. case DLM_RCOM_NAMES_REPLY:
  405. case DLM_RCOM_LOOKUP_REPLY:
  406. case DLM_RCOM_LOCK_REPLY:
  407. spin_lock(&ls->ls_recover_lock);
  408. seq = ls->ls_recover_seq;
  409. spin_unlock(&ls->ls_recover_lock);
  410. if (rc->rc_seq_reply != seq) {
  411. log_debug(ls, "ignoring old reply %x from %d "
  412. "seq_reply %llx expect %llx",
  413. rc->rc_type, rc->rc_header.h_nodeid,
  414. (unsigned long long)rc->rc_seq_reply,
  415. (unsigned long long)seq);
  416. rv = 1;
  417. }
  418. }
  419. return rv;
  420. }
  421. /* Called by dlm_recv; corresponds to dlm_receive_message() but special
  422. recovery-only comms are sent through here. */
  423. void dlm_receive_rcom(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid)
  424. {
  425. int lock_size = sizeof(struct dlm_rcom) + sizeof(struct rcom_lock);
  426. if (dlm_recovery_stopped(ls) && (rc->rc_type != DLM_RCOM_STATUS)) {
  427. log_debug(ls, "ignoring recovery message %x from %d",
  428. rc->rc_type, nodeid);
  429. goto out;
  430. }
  431. if (is_old_reply(ls, rc))
  432. goto out;
  433. switch (rc->rc_type) {
  434. case DLM_RCOM_STATUS:
  435. receive_rcom_status(ls, rc);
  436. break;
  437. case DLM_RCOM_NAMES:
  438. receive_rcom_names(ls, rc);
  439. break;
  440. case DLM_RCOM_LOOKUP:
  441. receive_rcom_lookup(ls, rc);
  442. break;
  443. case DLM_RCOM_LOCK:
  444. if (rc->rc_header.h_length < lock_size)
  445. goto Eshort;
  446. receive_rcom_lock(ls, rc);
  447. break;
  448. case DLM_RCOM_STATUS_REPLY:
  449. receive_sync_reply(ls, rc);
  450. break;
  451. case DLM_RCOM_NAMES_REPLY:
  452. receive_sync_reply(ls, rc);
  453. break;
  454. case DLM_RCOM_LOOKUP_REPLY:
  455. receive_rcom_lookup_reply(ls, rc);
  456. break;
  457. case DLM_RCOM_LOCK_REPLY:
  458. if (rc->rc_header.h_length < lock_size)
  459. goto Eshort;
  460. dlm_recover_process_copy(ls, rc);
  461. break;
  462. default:
  463. log_error(ls, "receive_rcom bad type %d", rc->rc_type);
  464. }
  465. out:
  466. return;
  467. Eshort:
  468. log_error(ls, "recovery message %x from %d is too short",
  469. rc->rc_type, nodeid);
  470. }