raid5-cache.c

/*
 * Copyright (C) 2015 Shaohua Li <shli@fb.com>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
 * more details.
 *
 */
#include <linux/kernel.h>
#include <linux/wait.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/raid/md_p.h>
#include <linux/crc32c.h>
#include <linux/random.h>
#include "md.h"
#include "raid5.h"
/*
 * Metadata and data are stored on disk in 4k units (blocks) regardless of the
 * underlying hardware sector size. This only works with PAGE_SIZE == 4096.
 */
#define BLOCK_SECTORS (8)

/*
 * Reclaim runs once reclaimable space reaches 1/4 of the device size or 10G,
 * whichever is smaller. This prevents recovery from having to scan a very
 * long log.
 */
#define RECLAIM_MAX_FREE_SPACE (10 * 1024 * 1024 * 2) /* sector */
#define RECLAIM_MAX_FREE_SPACE_SHIFT (2)
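
/*
 * Note (added for clarity): RECLAIM_MAX_FREE_SPACE is expressed in 512-byte
 * sectors, so 10 * 1024 * 1024 * 2 sectors == 10 GiB. The threshold actually
 * used at run time is min(device_size >> RECLAIM_MAX_FREE_SPACE_SHIFT,
 * RECLAIM_MAX_FREE_SPACE), i.e. a quarter of the log device or 10 GiB,
 * whichever is smaller (see r5l_load_log()).
 */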

/*
 * We only need 2 bios per I/O unit to make progress, but ensure we
 * have a few more available to not get too tight.
 */
#define R5L_POOL_SIZE 4

struct r5l_log {
	struct md_rdev *rdev;

	u32 uuid_checksum;

	sector_t device_size;		/* log device size, rounded to
					 * BLOCK_SECTORS */
	sector_t max_free_space;	/* reclaim runs if free space is at
					 * this size */

	sector_t last_checkpoint;	/* log tail. where recovery scan
					 * starts from */
	u64 last_cp_seq;		/* log tail sequence */

	sector_t log_start;		/* log head. where new data appends */
	u64 seq;			/* log head sequence */

	sector_t next_checkpoint;
	u64 next_cp_seq;

	struct mutex io_mutex;
	struct r5l_io_unit *current_io;	/* current io_unit accepting new data */

	spinlock_t io_list_lock;
	struct list_head running_ios;	/* io_units which are still running,
					 * and have not yet been completely
					 * written to the log */
	struct list_head io_end_ios;	/* io_units which have been completely
					 * written to the log but not yet
					 * written to the RAID */
	struct list_head flushing_ios;	/* io_units which are waiting for log
					 * cache flush */
	struct list_head finished_ios;	/* io_units which settle down in log disk */
	struct bio flush_bio;

	struct list_head no_mem_stripes;   /* pending stripes, -ENOMEM */

	struct kmem_cache *io_kc;
	mempool_t *io_pool;
	struct bio_set *bs;
	mempool_t *meta_pool;

	struct md_thread *reclaim_thread;
	unsigned long reclaim_target;	/* amount of space that needs to be
					 * reclaimed. If it's 0, reclaim spaces
					 * used by io_units which are in
					 * IO_UNIT_STRIPE_END state (eg, reclaim
					 * doesn't wait for a specific io_unit
					 * switching to IO_UNIT_STRIPE_END
					 * state) */
	wait_queue_head_t iounit_wait;

	struct list_head no_space_stripes; /* pending stripes, log has no space */
	spinlock_t no_space_stripes_lock;

	bool need_cache_flush;
};

/*
 * An I/O range starts at a metadata block and ends at the next metadata
 * block. The io_unit's metadata block tracks the data/parity that follows
 * it. The io_unit is written to the log disk with a normal write; since we
 * always flush the log disk before we start moving data to the RAID disks,
 * there is no need to write the io_unit with FLUSH/FUA.
 */
struct r5l_io_unit {
	struct r5l_log *log;

	struct page *meta_page;	/* store meta block */
	int meta_offset;	/* current offset in meta_page */

	struct bio *current_bio;/* current_bio accepting new data */

	atomic_t pending_stripe;/* how many stripes not flushed to raid */
	u64 seq;		/* seq number of the metablock */
	sector_t log_start;	/* where the io_unit starts */
	sector_t log_end;	/* where the io_unit ends */
	struct list_head log_sibling; /* log->running_ios */
	struct list_head stripe_list; /* stripes added to the io_unit */

	int state;
	bool need_split_bio;
};

/* r5l_io_unit state */
enum r5l_io_unit_state {
	IO_UNIT_RUNNING = 0,	/* accepting new IO */
	IO_UNIT_IO_START = 1,	/* io_unit bio has started writing to log,
				 * no longer accepting new bios */
	IO_UNIT_IO_END = 2,	/* io_unit bio finished writing to log */
	IO_UNIT_STRIPE_END = 3,	/* stripe data finished writing to raid */
};
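
/*
 * Note (added for clarity): an io_unit only ever moves forward through these
 * states (RUNNING -> IO_START -> IO_END -> STRIPE_END);
 * __r5l_set_io_unit_state() below warns and bails out if a transition would
 * repeat a state or go backwards.
 */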

static sector_t r5l_ring_add(struct r5l_log *log, sector_t start, sector_t inc)
{
	start += inc;
	if (start >= log->device_size)
		start = start - log->device_size;
	return start;
}

static sector_t r5l_ring_distance(struct r5l_log *log, sector_t start,
				  sector_t end)
{
	if (end >= start)
		return end - start;
	else
		return end + log->device_size - start;
}

static bool r5l_has_free_space(struct r5l_log *log, sector_t size)
{
	sector_t used_size;

	used_size = r5l_ring_distance(log, log->last_checkpoint,
				      log->log_start);

	return log->device_size > used_size + size;
}
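
/*
 * Worked example (added, hypothetical numbers): with device_size = 8192
 * sectors, last_checkpoint = 8000 and log_start = 100, the head has wrapped
 * around, so r5l_ring_distance() returns 100 + 8192 - 8000 = 292 used
 * sectors, and r5l_has_free_space() reports free space for any reservation
 * smaller than 8192 - 292 = 7900 sectors.
 */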

static void __r5l_set_io_unit_state(struct r5l_io_unit *io,
				    enum r5l_io_unit_state state)
{
	if (WARN_ON(io->state >= state))
		return;
	io->state = state;
}

static void r5l_io_run_stripes(struct r5l_io_unit *io)
{
	struct stripe_head *sh, *next;

	list_for_each_entry_safe(sh, next, &io->stripe_list, log_list) {
		list_del_init(&sh->log_list);
		set_bit(STRIPE_HANDLE, &sh->state);
		raid5_release_stripe(sh);
	}
}

static void r5l_log_run_stripes(struct r5l_log *log)
{
	struct r5l_io_unit *io, *next;

	assert_spin_locked(&log->io_list_lock);

	list_for_each_entry_safe(io, next, &log->running_ios, log_sibling) {
		/* don't change list order */
		if (io->state < IO_UNIT_IO_END)
			break;

		list_move_tail(&io->log_sibling, &log->finished_ios);
		r5l_io_run_stripes(io);
	}
}

static void r5l_move_to_end_ios(struct r5l_log *log)
{
	struct r5l_io_unit *io, *next;

	assert_spin_locked(&log->io_list_lock);

	list_for_each_entry_safe(io, next, &log->running_ios, log_sibling) {
		/* don't change list order */
		if (io->state < IO_UNIT_IO_END)
			break;
		list_move_tail(&io->log_sibling, &log->io_end_ios);
	}
}

static void r5l_log_endio(struct bio *bio)
{
	struct r5l_io_unit *io = bio->bi_private;
	struct r5l_log *log = io->log;
	unsigned long flags;

	if (bio->bi_error)
		md_error(log->rdev->mddev, log->rdev);

	bio_put(bio);
	mempool_free(io->meta_page, log->meta_pool);

	spin_lock_irqsave(&log->io_list_lock, flags);
	__r5l_set_io_unit_state(io, IO_UNIT_IO_END);
	if (log->need_cache_flush)
		r5l_move_to_end_ios(log);
	else
		r5l_log_run_stripes(log);
	spin_unlock_irqrestore(&log->io_list_lock, flags);

	if (log->need_cache_flush)
		md_wakeup_thread(log->rdev->mddev->thread);
}

static void r5l_submit_current_io(struct r5l_log *log)
{
	struct r5l_io_unit *io = log->current_io;
	struct r5l_meta_block *block;
	unsigned long flags;
	u32 crc;

	if (!io)
		return;

	block = page_address(io->meta_page);
	block->meta_size = cpu_to_le32(io->meta_offset);
	crc = crc32c_le(log->uuid_checksum, block, PAGE_SIZE);
	block->checksum = cpu_to_le32(crc);

	log->current_io = NULL;
	spin_lock_irqsave(&log->io_list_lock, flags);
	__r5l_set_io_unit_state(io, IO_UNIT_IO_START);
	spin_unlock_irqrestore(&log->io_list_lock, flags);

	submit_bio(io->current_bio);
}
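
/*
 * Note (added for clarity): if the log device has a volatile write cache
 * (need_cache_flush), r5l_log_endio() only queues finished io_units on
 * io_end_ios and wakes the md thread; the stripes are released to the raid
 * disks after r5l_flush_stripe_to_raid() has flushed the log device cache.
 * Without a write cache, the stripes are released directly from the endio via
 * r5l_log_run_stripes().
 */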

static struct bio *r5l_bio_alloc(struct r5l_log *log)
{
	struct bio *bio = bio_alloc_bioset(GFP_NOIO, BIO_MAX_PAGES, log->bs);

	bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
	bio->bi_bdev = log->rdev->bdev;
	bio->bi_iter.bi_sector = log->rdev->data_offset + log->log_start;

	return bio;
}

static void r5_reserve_log_entry(struct r5l_log *log, struct r5l_io_unit *io)
{
	log->log_start = r5l_ring_add(log, log->log_start, BLOCK_SECTORS);

	/*
	 * If we filled up the log device, start from the beginning again,
	 * which will require a new bio.
	 *
	 * Note: for this to work properly the log size needs to be a multiple
	 * of BLOCK_SECTORS.
	 */
	if (log->log_start == 0)
		io->need_split_bio = true;

	io->log_end = log->log_start;
}

static struct r5l_io_unit *r5l_new_meta(struct r5l_log *log)
{
	struct r5l_io_unit *io;
	struct r5l_meta_block *block;

	io = mempool_alloc(log->io_pool, GFP_ATOMIC);
	if (!io)
		return NULL;
	memset(io, 0, sizeof(*io));

	io->log = log;
	INIT_LIST_HEAD(&io->log_sibling);
	INIT_LIST_HEAD(&io->stripe_list);
	io->state = IO_UNIT_RUNNING;

	io->meta_page = mempool_alloc(log->meta_pool, GFP_NOIO);
	block = page_address(io->meta_page);
	clear_page(block);
	block->magic = cpu_to_le32(R5LOG_MAGIC);
	block->version = R5LOG_VERSION;
	block->seq = cpu_to_le64(log->seq);
	block->position = cpu_to_le64(log->log_start);

	io->log_start = log->log_start;
	io->meta_offset = sizeof(struct r5l_meta_block);
	io->seq = log->seq++;

	io->current_bio = r5l_bio_alloc(log);
	io->current_bio->bi_end_io = r5l_log_endio;
	io->current_bio->bi_private = io;
	bio_add_page(io->current_bio, io->meta_page, PAGE_SIZE, 0);

	r5_reserve_log_entry(log, io);

	spin_lock_irq(&log->io_list_lock);
	list_add_tail(&io->log_sibling, &log->running_ios);
	spin_unlock_irq(&log->io_list_lock);

	return io;
}

static int r5l_get_meta(struct r5l_log *log, unsigned int payload_size)
{
	if (log->current_io &&
	    log->current_io->meta_offset + payload_size > PAGE_SIZE)
		r5l_submit_current_io(log);

	if (!log->current_io) {
		log->current_io = r5l_new_meta(log);
		if (!log->current_io)
			return -ENOMEM;
	}

	return 0;
}

static void r5l_append_payload_meta(struct r5l_log *log, u16 type,
				    sector_t location,
				    u32 checksum1, u32 checksum2,
				    bool checksum2_valid)
{
	struct r5l_io_unit *io = log->current_io;
	struct r5l_payload_data_parity *payload;

	payload = page_address(io->meta_page) + io->meta_offset;
	payload->header.type = cpu_to_le16(type);
	payload->header.flags = cpu_to_le16(0);
	payload->size = cpu_to_le32((1 + !!checksum2_valid) <<
				    (PAGE_SHIFT - 9));
	payload->location = cpu_to_le64(location);
	payload->checksum[0] = cpu_to_le32(checksum1);
	if (checksum2_valid)
		payload->checksum[1] = cpu_to_le32(checksum2);

	io->meta_offset += sizeof(struct r5l_payload_data_parity) +
		sizeof(__le32) * (1 + !!checksum2_valid);
}

static void r5l_append_payload_page(struct r5l_log *log, struct page *page)
{
	struct r5l_io_unit *io = log->current_io;

	if (io->need_split_bio) {
		struct bio *prev = io->current_bio;

		io->current_bio = r5l_bio_alloc(log);
		bio_chain(io->current_bio, prev);

		submit_bio(prev);
	}

	if (!bio_add_page(io->current_bio, page, PAGE_SIZE, 0))
		BUG();

	r5_reserve_log_entry(log, io);
}
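
/*
 * Note (added for clarity): payload->size is recorded in 512-byte sectors,
 * i.e. (1 or 2) << (PAGE_SHIFT - 9), so it is 8 for a single data page and
 * 16 for a P+Q parity pair on a 4K PAGE_SIZE kernel.
 */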

static int r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh,
			  int data_pages, int parity_pages)
{
	int i;
	int meta_size;
	int ret;
	struct r5l_io_unit *io;

	meta_size =
		((sizeof(struct r5l_payload_data_parity) + sizeof(__le32))
		 * data_pages) +
		sizeof(struct r5l_payload_data_parity) +
		sizeof(__le32) * parity_pages;

	ret = r5l_get_meta(log, meta_size);
	if (ret)
		return ret;

	io = log->current_io;

	for (i = 0; i < sh->disks; i++) {
		if (!test_bit(R5_Wantwrite, &sh->dev[i].flags))
			continue;
		if (i == sh->pd_idx || i == sh->qd_idx)
			continue;
		r5l_append_payload_meta(log, R5LOG_PAYLOAD_DATA,
					raid5_compute_blocknr(sh, i, 0),
					sh->dev[i].log_checksum, 0, false);
		r5l_append_payload_page(log, sh->dev[i].page);
	}

	if (sh->qd_idx >= 0) {
		r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY,
					sh->sector, sh->dev[sh->pd_idx].log_checksum,
					sh->dev[sh->qd_idx].log_checksum, true);
		r5l_append_payload_page(log, sh->dev[sh->pd_idx].page);
		r5l_append_payload_page(log, sh->dev[sh->qd_idx].page);
	} else {
		r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY,
					sh->sector, sh->dev[sh->pd_idx].log_checksum,
					0, false);
		r5l_append_payload_page(log, sh->dev[sh->pd_idx].page);
	}

	list_add_tail(&sh->log_list, &io->stripe_list);
	atomic_inc(&io->pending_stripe);
	sh->log_io = io;

	return 0;
}
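
/*
 * Note (added for clarity): the resulting on-disk io_unit is one 4K metadata
 * block (an r5l_meta_block header followed by one r5l_payload_data_parity
 * descriptor per data page plus one for the parity, each carrying per-page
 * crc32c checksums), followed by the data pages and then the parity page(s).
 */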

static void r5l_wake_reclaim(struct r5l_log *log, sector_t space);
/*
 * running in raid5d, where reclaim could wait for raid5d too (when it flushes
 * data from log to raid disks), so we shouldn't wait for reclaim here
 */
int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh)
{
	int write_disks = 0;
	int data_pages, parity_pages;
	int meta_size;
	int reserve;
	int i;
	int ret = 0;

	if (!log)
		return -EAGAIN;
	/* Don't support stripe batch */
	if (sh->log_io || !test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags) ||
	    test_bit(STRIPE_SYNCING, &sh->state)) {
		/* the stripe is written to log, we start writing it to raid */
		clear_bit(STRIPE_LOG_TRAPPED, &sh->state);
		return -EAGAIN;
	}

	for (i = 0; i < sh->disks; i++) {
		void *addr;

		if (!test_bit(R5_Wantwrite, &sh->dev[i].flags))
			continue;
		write_disks++;
		/* checksum is already calculated in last run */
		if (test_bit(STRIPE_LOG_TRAPPED, &sh->state))
			continue;
		addr = kmap_atomic(sh->dev[i].page);
		sh->dev[i].log_checksum = crc32c_le(log->uuid_checksum,
						    addr, PAGE_SIZE);
		kunmap_atomic(addr);
	}
	parity_pages = 1 + !!(sh->qd_idx >= 0);
	data_pages = write_disks - parity_pages;

	meta_size =
		((sizeof(struct r5l_payload_data_parity) + sizeof(__le32))
		 * data_pages) +
		sizeof(struct r5l_payload_data_parity) +
		sizeof(__le32) * parity_pages;
	/* Doesn't work with very big raid array */
	if (meta_size + sizeof(struct r5l_meta_block) > PAGE_SIZE)
		return -EINVAL;

	set_bit(STRIPE_LOG_TRAPPED, &sh->state);
	/*
	 * The stripe must enter state machine again to finish the write, so
	 * don't delay.
	 */
	clear_bit(STRIPE_DELAYED, &sh->state);
	atomic_inc(&sh->count);

	mutex_lock(&log->io_mutex);
	/* meta + data */
	reserve = (1 + write_disks) << (PAGE_SHIFT - 9);
	if (!r5l_has_free_space(log, reserve)) {
		spin_lock(&log->no_space_stripes_lock);
		list_add_tail(&sh->log_list, &log->no_space_stripes);
		spin_unlock(&log->no_space_stripes_lock);

		r5l_wake_reclaim(log, reserve);
	} else {
		ret = r5l_log_stripe(log, sh, data_pages, parity_pages);
		if (ret) {
			spin_lock_irq(&log->io_list_lock);
			list_add_tail(&sh->log_list, &log->no_mem_stripes);
			spin_unlock_irq(&log->io_list_lock);
		}
	}

	mutex_unlock(&log->io_mutex);
	return 0;
}
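
/*
 * Worked example (added, hypothetical numbers): for a RAID6 full-stripe write
 * over 4 data disks, write_disks == 6 (4 data + P + Q), so the reservation is
 * (1 + 6) << (PAGE_SHIFT - 9) = 7 * 8 = 56 sectors: one 4K metadata block
 * plus one 4K block per page written to the log.
 */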

void r5l_write_stripe_run(struct r5l_log *log)
{
	if (!log)
		return;
	mutex_lock(&log->io_mutex);
	r5l_submit_current_io(log);
	mutex_unlock(&log->io_mutex);
}

int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio)
{
	if (!log)
		return -ENODEV;
	/*
	 * we flush the log disk cache first, then write stripe data to raid
	 * disks. So if bio is finished, the log disk cache is flushed already.
	 * Recovery guarantees we can recover the bio from the log disk, so we
	 * don't need to flush again.
	 */
	if (bio->bi_iter.bi_size == 0) {
		bio_endio(bio);
		return 0;
	}
	bio->bi_opf &= ~REQ_PREFLUSH;
	return -EAGAIN;
}
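
/*
 * Note (added for clarity): the return value tells the caller how to proceed:
 * 0 means the (empty) flush was completed here, -EAGAIN means REQ_PREFLUSH
 * was stripped and the caller should process the data portion of the bio as a
 * normal write, and -ENODEV means there is no journal.
 */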

/* This will run after log space is reclaimed */
static void r5l_run_no_space_stripes(struct r5l_log *log)
{
	struct stripe_head *sh;

	spin_lock(&log->no_space_stripes_lock);
	while (!list_empty(&log->no_space_stripes)) {
		sh = list_first_entry(&log->no_space_stripes,
				      struct stripe_head, log_list);
		list_del_init(&sh->log_list);
		set_bit(STRIPE_HANDLE, &sh->state);
		raid5_release_stripe(sh);
	}
	spin_unlock(&log->no_space_stripes_lock);
}

static sector_t r5l_reclaimable_space(struct r5l_log *log)
{
	return r5l_ring_distance(log, log->last_checkpoint,
				 log->next_checkpoint);
}

static void r5l_run_no_mem_stripe(struct r5l_log *log)
{
	struct stripe_head *sh;

	assert_spin_locked(&log->io_list_lock);

	if (!list_empty(&log->no_mem_stripes)) {
		sh = list_first_entry(&log->no_mem_stripes,
				      struct stripe_head, log_list);
		list_del_init(&sh->log_list);
		set_bit(STRIPE_HANDLE, &sh->state);
		raid5_release_stripe(sh);
	}
}

static bool r5l_complete_finished_ios(struct r5l_log *log)
{
	struct r5l_io_unit *io, *next;
	bool found = false;

	assert_spin_locked(&log->io_list_lock);

	list_for_each_entry_safe(io, next, &log->finished_ios, log_sibling) {
		/* don't change list order */
		if (io->state < IO_UNIT_STRIPE_END)
			break;

		log->next_checkpoint = io->log_start;
		log->next_cp_seq = io->seq;

		list_del(&io->log_sibling);
		mempool_free(io, log->io_pool);

		r5l_run_no_mem_stripe(log);

		found = true;
	}

	return found;
}

static void __r5l_stripe_write_finished(struct r5l_io_unit *io)
{
	struct r5l_log *log = io->log;
	unsigned long flags;

	spin_lock_irqsave(&log->io_list_lock, flags);
	__r5l_set_io_unit_state(io, IO_UNIT_STRIPE_END);

	if (!r5l_complete_finished_ios(log)) {
		spin_unlock_irqrestore(&log->io_list_lock, flags);
		return;
	}

	if (r5l_reclaimable_space(log) > log->max_free_space)
		r5l_wake_reclaim(log, 0);

	spin_unlock_irqrestore(&log->io_list_lock, flags);
	wake_up(&log->iounit_wait);
}

void r5l_stripe_write_finished(struct stripe_head *sh)
{
	struct r5l_io_unit *io;

	io = sh->log_io;
	sh->log_io = NULL;

	if (io && atomic_dec_and_test(&io->pending_stripe))
		__r5l_stripe_write_finished(io);
}

static void r5l_log_flush_endio(struct bio *bio)
{
	struct r5l_log *log = container_of(bio, struct r5l_log,
					   flush_bio);
	unsigned long flags;
	struct r5l_io_unit *io;

	if (bio->bi_error)
		md_error(log->rdev->mddev, log->rdev);

	spin_lock_irqsave(&log->io_list_lock, flags);
	list_for_each_entry(io, &log->flushing_ios, log_sibling)
		r5l_io_run_stripes(io);
	list_splice_tail_init(&log->flushing_ios, &log->finished_ios);
	spin_unlock_irqrestore(&log->io_list_lock, flags);
}

/*
 * Starting to dispatch IO to raid.
 * The log consists of io_units, each headed by a metadata block. There is one
 * situation we want to avoid: a broken metadata block in the middle of the
 * log prevents recovery from finding the metadata at the head of the log. If
 * an operation requires the metadata at the head to be persistent in the log,
 * we must make sure the metadata before it is persistent in the log too. A
 * case is:
 *
 * stripe data/parity is in the log, and we start writing the stripe to the
 * raid disks. The stripe data/parity must be persistent in the log before we
 * do the write to the raid disks.
 *
 * The solution is that we strictly maintain io_unit list order. We only write
 * the stripes of an io_unit to the raid disks once that io_unit is the first
 * one whose data/parity is in the log.
 */
void r5l_flush_stripe_to_raid(struct r5l_log *log)
{
	bool do_flush;

	if (!log || !log->need_cache_flush)
		return;

	spin_lock_irq(&log->io_list_lock);
	/* flush bio is running */
	if (!list_empty(&log->flushing_ios)) {
		spin_unlock_irq(&log->io_list_lock);
		return;
	}
	list_splice_tail_init(&log->io_end_ios, &log->flushing_ios);
	do_flush = !list_empty(&log->flushing_ios);
	spin_unlock_irq(&log->io_list_lock);

	if (!do_flush)
		return;
	bio_reset(&log->flush_bio);
	log->flush_bio.bi_bdev = log->rdev->bdev;
	log->flush_bio.bi_end_io = r5l_log_flush_endio;
	bio_set_op_attrs(&log->flush_bio, REQ_OP_WRITE, WRITE_FLUSH);

	submit_bio(&log->flush_bio);
}
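
/*
 * Note (added for clarity): flush_bio is embedded in struct r5l_log, so at
 * most one log-device flush is in flight at a time; that is why this function
 * returns early while flushing_ios is non-empty. The stripes queued behind
 * the flush are released in r5l_log_flush_endio().
 */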

static void r5l_write_super(struct r5l_log *log, sector_t cp);

static void r5l_write_super_and_discard_space(struct r5l_log *log,
					      sector_t end)
{
	struct block_device *bdev = log->rdev->bdev;
	struct mddev *mddev;

	r5l_write_super(log, end);

	if (!blk_queue_discard(bdev_get_queue(bdev)))
		return;

	mddev = log->rdev->mddev;
	/*
	 * Discard could zero data, so before discard we must make sure the
	 * superblock is updated to the new log tail. Updating the superblock
	 * (either by calling md_update_sb() directly or depending on the md
	 * thread) must hold the reconfig mutex. On the other hand,
	 * raid5_quiesce is called with the reconfig_mutex held. The first step
	 * of raid5_quiesce() is waiting for all IO to finish, hence waiting
	 * for the reclaim thread, while the reclaim thread is calling this
	 * function and waiting for the reconfig mutex. So there is a deadlock.
	 * We work around this issue with a trylock.
	 * FIXME: we could miss a discard if we can't take the reconfig mutex.
	 */
	set_mask_bits(&mddev->flags, 0,
		      BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_PENDING));
	if (!mddev_trylock(mddev))
		return;
	md_update_sb(mddev, 1);
	mddev_unlock(mddev);

	/* discard IO error really doesn't matter, ignore it */
	if (log->last_checkpoint < end) {
		blkdev_issue_discard(bdev,
				log->last_checkpoint + log->rdev->data_offset,
				end - log->last_checkpoint, GFP_NOIO, 0);
	} else {
		blkdev_issue_discard(bdev,
				log->last_checkpoint + log->rdev->data_offset,
				log->device_size - log->last_checkpoint,
				GFP_NOIO, 0);
		blkdev_issue_discard(bdev, log->rdev->data_offset, end,
				GFP_NOIO, 0);
	}
}
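
/*
 * Note (added for clarity): when the reclaimed range wraps around the end of
 * the ring (last_checkpoint >= end), the discard is split in two: from
 * last_checkpoint to the end of the log area and from the start of the log
 * area up to 'end', both shifted by rdev->data_offset on the underlying
 * device.
 */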

static void r5l_do_reclaim(struct r5l_log *log)
{
	sector_t reclaim_target = xchg(&log->reclaim_target, 0);
	sector_t reclaimable;
	sector_t next_checkpoint;
	u64 next_cp_seq;

	spin_lock_irq(&log->io_list_lock);
	/*
	 * move the proper io_units to the reclaim list. We should not change
	 * the order. reclaimable/unreclaimable io_units can be mixed in the
	 * list, and we shouldn't reuse the space of an unreclaimable io_unit.
	 */
	while (1) {
		reclaimable = r5l_reclaimable_space(log);
		if (reclaimable >= reclaim_target ||
		    (list_empty(&log->running_ios) &&
		     list_empty(&log->io_end_ios) &&
		     list_empty(&log->flushing_ios) &&
		     list_empty(&log->finished_ios)))
			break;

		md_wakeup_thread(log->rdev->mddev->thread);
		wait_event_lock_irq(log->iounit_wait,
				    r5l_reclaimable_space(log) > reclaimable,
				    log->io_list_lock);
	}

	next_checkpoint = log->next_checkpoint;
	next_cp_seq = log->next_cp_seq;
	spin_unlock_irq(&log->io_list_lock);

	BUG_ON(reclaimable < 0);
	if (reclaimable == 0)
		return;

	/*
	 * write_super will flush the cache of each raid disk. We must write
	 * the super here, because the log area might be reused soon and we
	 * don't want to confuse recovery.
	 */
	r5l_write_super_and_discard_space(log, next_checkpoint);

	mutex_lock(&log->io_mutex);
	log->last_checkpoint = next_checkpoint;
	log->last_cp_seq = next_cp_seq;
	mutex_unlock(&log->io_mutex);

	r5l_run_no_space_stripes(log);
}

static void r5l_reclaim_thread(struct md_thread *thread)
{
	struct mddev *mddev = thread->mddev;
	struct r5conf *conf = mddev->private;
	struct r5l_log *log = conf->log;

	if (!log)
		return;
	r5l_do_reclaim(log);
}

static void r5l_wake_reclaim(struct r5l_log *log, sector_t space)
{
	unsigned long target;
	unsigned long new = (unsigned long)space; /* overflow in theory */

	do {
		target = log->reclaim_target;
		if (new < target)
			return;
	} while (cmpxchg(&log->reclaim_target, target, new) != target);
	md_wakeup_thread(log->reclaim_thread);
}
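
/*
 * Note (added for clarity): the cmpxchg() loop keeps reclaim_target at the
 * largest amount requested so far; strictly smaller requests return without
 * waking the thread. r5l_quiesce() passes -1L, which becomes the maximum
 * possible target and forces the thread to reclaim everything it can.
 */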

void r5l_quiesce(struct r5l_log *log, int state)
{
	struct mddev *mddev;

	if (!log || state == 2)
		return;
	if (state == 0) {
		/*
		 * This is a special case for hotadd. In suspend, the array has
		 * no journal. In resume, journal is initialized as well as the
		 * reclaim thread.
		 */
		if (log->reclaim_thread)
			return;
		log->reclaim_thread = md_register_thread(r5l_reclaim_thread,
					log->rdev->mddev, "reclaim");
	} else if (state == 1) {
		/* make sure r5l_write_super_and_discard_space exits */
		mddev = log->rdev->mddev;
		wake_up(&mddev->sb_wait);
		r5l_wake_reclaim(log, -1L);
		md_unregister_thread(&log->reclaim_thread);
		r5l_do_reclaim(log);
	}
}

bool r5l_log_disk_error(struct r5conf *conf)
{
	struct r5l_log *log;
	bool ret;

	/* don't allow write if journal disk is missing */
	rcu_read_lock();
	log = rcu_dereference(conf->log);

	if (!log)
		ret = test_bit(MD_HAS_JOURNAL, &conf->mddev->flags);
	else
		ret = test_bit(Faulty, &log->rdev->flags);
	rcu_read_unlock();
	return ret;
}

struct r5l_recovery_ctx {
	struct page *meta_page;		/* current meta */
	sector_t meta_total_blocks;	/* total size of current meta and data */
	sector_t pos;			/* recovery position */
	u64 seq;			/* recovery position seq */
};

static int r5l_read_meta_block(struct r5l_log *log,
			       struct r5l_recovery_ctx *ctx)
{
	struct page *page = ctx->meta_page;
	struct r5l_meta_block *mb;
	u32 crc, stored_crc;

	if (!sync_page_io(log->rdev, ctx->pos, PAGE_SIZE, page, REQ_OP_READ, 0,
			  false))
		return -EIO;

	mb = page_address(page);
	stored_crc = le32_to_cpu(mb->checksum);
	mb->checksum = 0;

	if (le32_to_cpu(mb->magic) != R5LOG_MAGIC ||
	    le64_to_cpu(mb->seq) != ctx->seq ||
	    mb->version != R5LOG_VERSION ||
	    le64_to_cpu(mb->position) != ctx->pos)
		return -EINVAL;

	crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
	if (stored_crc != crc)
		return -EINVAL;

	if (le32_to_cpu(mb->meta_size) > PAGE_SIZE)
		return -EINVAL;

	ctx->meta_total_blocks = BLOCK_SECTORS;

	return 0;
}
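
/*
 * Note (added for clarity): a metadata block is accepted only if its magic,
 * version, recorded position and expected sequence number all match and the
 * crc32c over the whole page (seeded with the array UUID checksum) verifies;
 * any mismatch is treated as the end of the log during recovery.
 */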

static int r5l_recovery_flush_one_stripe(struct r5l_log *log,
					 struct r5l_recovery_ctx *ctx,
					 sector_t stripe_sect,
					 int *offset, sector_t *log_offset)
{
	struct r5conf *conf = log->rdev->mddev->private;
	struct stripe_head *sh;
	struct r5l_payload_data_parity *payload;
	int disk_index;

	sh = raid5_get_active_stripe(conf, stripe_sect, 0, 0, 0);
	while (1) {
		payload = page_address(ctx->meta_page) + *offset;

		if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_DATA) {
			raid5_compute_sector(conf,
					     le64_to_cpu(payload->location), 0,
					     &disk_index, sh);

			sync_page_io(log->rdev, *log_offset, PAGE_SIZE,
				     sh->dev[disk_index].page, REQ_OP_READ, 0,
				     false);
			sh->dev[disk_index].log_checksum =
				le32_to_cpu(payload->checksum[0]);
			set_bit(R5_Wantwrite, &sh->dev[disk_index].flags);
			ctx->meta_total_blocks += BLOCK_SECTORS;
		} else {
			disk_index = sh->pd_idx;
			sync_page_io(log->rdev, *log_offset, PAGE_SIZE,
				     sh->dev[disk_index].page, REQ_OP_READ, 0,
				     false);
			sh->dev[disk_index].log_checksum =
				le32_to_cpu(payload->checksum[0]);
			set_bit(R5_Wantwrite, &sh->dev[disk_index].flags);

			if (sh->qd_idx >= 0) {
				disk_index = sh->qd_idx;
				sync_page_io(log->rdev,
					     r5l_ring_add(log, *log_offset, BLOCK_SECTORS),
					     PAGE_SIZE, sh->dev[disk_index].page,
					     REQ_OP_READ, 0, false);
				sh->dev[disk_index].log_checksum =
					le32_to_cpu(payload->checksum[1]);
				set_bit(R5_Wantwrite,
					&sh->dev[disk_index].flags);
			}
			ctx->meta_total_blocks += BLOCK_SECTORS * conf->max_degraded;
		}

		*log_offset = r5l_ring_add(log, *log_offset,
					   le32_to_cpu(payload->size));
		*offset += sizeof(struct r5l_payload_data_parity) +
			sizeof(__le32) *
			(le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9));
		if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_PARITY)
			break;
	}

	for (disk_index = 0; disk_index < sh->disks; disk_index++) {
		void *addr;
		u32 checksum;

		if (!test_bit(R5_Wantwrite, &sh->dev[disk_index].flags))
			continue;
		addr = kmap_atomic(sh->dev[disk_index].page);
		checksum = crc32c_le(log->uuid_checksum, addr, PAGE_SIZE);
		kunmap_atomic(addr);
		if (checksum != sh->dev[disk_index].log_checksum)
			goto error;
	}

	for (disk_index = 0; disk_index < sh->disks; disk_index++) {
		struct md_rdev *rdev, *rrdev;

		if (!test_and_clear_bit(R5_Wantwrite,
					&sh->dev[disk_index].flags))
			continue;

		/* in case device is broken */
		rdev = rcu_dereference(conf->disks[disk_index].rdev);
		if (rdev)
			sync_page_io(rdev, stripe_sect, PAGE_SIZE,
				     sh->dev[disk_index].page, REQ_OP_WRITE, 0,
				     false);
		rrdev = rcu_dereference(conf->disks[disk_index].replacement);
		if (rrdev)
			sync_page_io(rrdev, stripe_sect, PAGE_SIZE,
				     sh->dev[disk_index].page, REQ_OP_WRITE, 0,
				     false);
	}
	raid5_release_stripe(sh);
	return 0;

error:
	for (disk_index = 0; disk_index < sh->disks; disk_index++)
		sh->dev[disk_index].flags = 0;
	raid5_release_stripe(sh);
	return -EINVAL;
}
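
/*
 * Note (added for clarity): for one stripe this reads the logged data and
 * parity pages back from the log device, verifies each page against the
 * crc32c stored in its payload descriptor, and only then writes the pages to
 * the member disks (and their replacements, if any). A checksum mismatch
 * clears the stripe and makes recovery stop at this metadata block.
 */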

static int r5l_recovery_flush_one_meta(struct r5l_log *log,
				       struct r5l_recovery_ctx *ctx)
{
	struct r5conf *conf = log->rdev->mddev->private;
	struct r5l_payload_data_parity *payload;
	struct r5l_meta_block *mb;
	int offset;
	sector_t log_offset;
	sector_t stripe_sector;

	mb = page_address(ctx->meta_page);
	offset = sizeof(struct r5l_meta_block);
	log_offset = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS);

	while (offset < le32_to_cpu(mb->meta_size)) {
		int dd;

		payload = (void *)mb + offset;
		stripe_sector = raid5_compute_sector(conf,
						     le64_to_cpu(payload->location), 0, &dd, NULL);
		if (r5l_recovery_flush_one_stripe(log, ctx, stripe_sector,
						  &offset, &log_offset))
			return -EINVAL;
	}
	return 0;
}

/* copy data/parity from log to raid disks */
static void r5l_recovery_flush_log(struct r5l_log *log,
				   struct r5l_recovery_ctx *ctx)
{
	while (1) {
		if (r5l_read_meta_block(log, ctx))
			return;
		if (r5l_recovery_flush_one_meta(log, ctx))
			return;
		ctx->seq++;
		ctx->pos = r5l_ring_add(log, ctx->pos, ctx->meta_total_blocks);
	}
}

static int r5l_log_write_empty_meta_block(struct r5l_log *log, sector_t pos,
					  u64 seq)
{
	struct page *page;
	struct r5l_meta_block *mb;
	u32 crc;

	page = alloc_page(GFP_KERNEL | __GFP_ZERO);
	if (!page)
		return -ENOMEM;
	mb = page_address(page);
	mb->magic = cpu_to_le32(R5LOG_MAGIC);
	mb->version = R5LOG_VERSION;
	mb->meta_size = cpu_to_le32(sizeof(struct r5l_meta_block));
	mb->seq = cpu_to_le64(seq);
	mb->position = cpu_to_le64(pos);
	crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
	mb->checksum = cpu_to_le32(crc);

	if (!sync_page_io(log->rdev, pos, PAGE_SIZE, page, REQ_OP_WRITE,
			  WRITE_FUA, false)) {
		__free_page(page);
		return -EIO;
	}
	__free_page(page);
	return 0;
}

static int r5l_recovery_log(struct r5l_log *log)
{
	struct r5l_recovery_ctx ctx;

	ctx.pos = log->last_checkpoint;
	ctx.seq = log->last_cp_seq;
	ctx.meta_page = alloc_page(GFP_KERNEL);
	if (!ctx.meta_page)
		return -ENOMEM;

	r5l_recovery_flush_log(log, &ctx);
	__free_page(ctx.meta_page);

	/*
	 * we did a recovery. Now ctx.pos points to an invalid meta block. New
	 * log will start here, but we can't let the superblock point to the
	 * last valid meta block. The log might look like:
	 * | meta 1| meta 2| meta 3|
	 * meta 1 is valid, meta 2 is invalid. meta 3 could be valid. If the
	 * superblock points to meta 1, we write a new valid meta 2n. If a
	 * crash happens again, the new recovery will start from meta 1. Since
	 * meta 2n is valid now, recovery will think meta 3 is valid, which is
	 * wrong. The solution is that we create a new meta in meta2 with its
	 * seq == meta 1's seq + 10 and let the superblock point to meta2. The
	 * same recovery will not think meta 3 is a valid meta, because its seq
	 * doesn't match.
	 */
	if (ctx.seq > log->last_cp_seq) {
		int ret;

		ret = r5l_log_write_empty_meta_block(log, ctx.pos, ctx.seq + 10);
		if (ret)
			return ret;
		log->seq = ctx.seq + 11;
		log->log_start = r5l_ring_add(log, ctx.pos, BLOCK_SECTORS);
		r5l_write_super(log, ctx.pos);
		log->last_checkpoint = ctx.pos;
		log->next_checkpoint = ctx.pos;
	} else {
		log->log_start = ctx.pos;
		log->seq = ctx.seq;
	}
	return 0;
}
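
/*
 * Illustration (added, hypothetical numbers): if the last block that replays
 * cleanly has seq 105, recovery stops with ctx.seq == 106 at ctx.pos. The
 * empty block written at ctx.pos then carries seq 116 and new appends
 * continue from seq 117, so a stale block left over from the old log (for
 * example one with seq 107) can no longer satisfy the sequence check in
 * r5l_read_meta_block() on a later recovery.
 */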

static void r5l_write_super(struct r5l_log *log, sector_t cp)
{
	struct mddev *mddev = log->rdev->mddev;

	log->rdev->journal_tail = cp;
	set_bit(MD_CHANGE_DEVS, &mddev->flags);
}

static int r5l_load_log(struct r5l_log *log)
{
	struct md_rdev *rdev = log->rdev;
	struct page *page;
	struct r5l_meta_block *mb;
	sector_t cp = log->rdev->journal_tail;
	u32 stored_crc, expected_crc;
	bool create_super = false;
	int ret;

	/* Make sure it's valid */
	if (cp >= rdev->sectors || round_down(cp, BLOCK_SECTORS) != cp)
		cp = 0;
	page = alloc_page(GFP_KERNEL);
	if (!page)
		return -ENOMEM;

	if (!sync_page_io(rdev, cp, PAGE_SIZE, page, REQ_OP_READ, 0, false)) {
		ret = -EIO;
		goto ioerr;
	}
	mb = page_address(page);

	if (le32_to_cpu(mb->magic) != R5LOG_MAGIC ||
	    mb->version != R5LOG_VERSION) {
		create_super = true;
		goto create;
	}
	stored_crc = le32_to_cpu(mb->checksum);
	mb->checksum = 0;
	expected_crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
	if (stored_crc != expected_crc) {
		create_super = true;
		goto create;
	}
	if (le64_to_cpu(mb->position) != cp) {
		create_super = true;
		goto create;
	}
create:
	if (create_super) {
		log->last_cp_seq = prandom_u32();
		cp = 0;
		r5l_log_write_empty_meta_block(log, cp, log->last_cp_seq);
		/*
		 * Make sure the super points to the correct address. The log
		 * might have data very soon. If the super doesn't have the
		 * correct log tail address, recovery can't find the log.
		 */
		r5l_write_super(log, cp);
	} else
		log->last_cp_seq = le64_to_cpu(mb->seq);

	log->device_size = round_down(rdev->sectors, BLOCK_SECTORS);
	log->max_free_space = log->device_size >> RECLAIM_MAX_FREE_SPACE_SHIFT;
	if (log->max_free_space > RECLAIM_MAX_FREE_SPACE)
		log->max_free_space = RECLAIM_MAX_FREE_SPACE;
	log->last_checkpoint = cp;
	log->next_checkpoint = cp;

	__free_page(page);

	return r5l_recovery_log(log);
ioerr:
	__free_page(page);
	return ret;
}
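
/*
 * Note (added, an interpretation): when no valid checkpoint meta block is
 * found, the log is (re)initialised with a random starting sequence number
 * from prandom_u32(); this makes it unlikely that blocks left over from an
 * earlier life of the device line up with the new sequence and get replayed
 * by mistake.
 */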

int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
{
	struct request_queue *q = bdev_get_queue(rdev->bdev);
	struct r5l_log *log;

	if (PAGE_SIZE != 4096)
		return -EINVAL;
	log = kzalloc(sizeof(*log), GFP_KERNEL);
	if (!log)
		return -ENOMEM;
	log->rdev = rdev;

	log->need_cache_flush = test_bit(QUEUE_FLAG_WC, &q->queue_flags) != 0;

	log->uuid_checksum = crc32c_le(~0, rdev->mddev->uuid,
				       sizeof(rdev->mddev->uuid));

	mutex_init(&log->io_mutex);

	spin_lock_init(&log->io_list_lock);
	INIT_LIST_HEAD(&log->running_ios);
	INIT_LIST_HEAD(&log->io_end_ios);
	INIT_LIST_HEAD(&log->flushing_ios);
	INIT_LIST_HEAD(&log->finished_ios);
	bio_init(&log->flush_bio);

	log->io_kc = KMEM_CACHE(r5l_io_unit, 0);
	if (!log->io_kc)
		goto io_kc;

	log->io_pool = mempool_create_slab_pool(R5L_POOL_SIZE, log->io_kc);
	if (!log->io_pool)
		goto io_pool;

	log->bs = bioset_create(R5L_POOL_SIZE, 0);
	if (!log->bs)
		goto io_bs;

	log->meta_pool = mempool_create_page_pool(R5L_POOL_SIZE, 0);
	if (!log->meta_pool)
		goto out_mempool;

	log->reclaim_thread = md_register_thread(r5l_reclaim_thread,
						 log->rdev->mddev, "reclaim");
	if (!log->reclaim_thread)
		goto reclaim_thread;
	init_waitqueue_head(&log->iounit_wait);

	INIT_LIST_HEAD(&log->no_mem_stripes);

	INIT_LIST_HEAD(&log->no_space_stripes);
	spin_lock_init(&log->no_space_stripes_lock);

	if (r5l_load_log(log))
		goto error;

	rcu_assign_pointer(conf->log, log);
	set_bit(MD_HAS_JOURNAL, &conf->mddev->flags);
	return 0;

error:
	md_unregister_thread(&log->reclaim_thread);
reclaim_thread:
	mempool_destroy(log->meta_pool);
out_mempool:
	bioset_free(log->bs);
io_bs:
	mempool_destroy(log->io_pool);
io_pool:
	kmem_cache_destroy(log->io_kc);
io_kc:
	kfree(log);
	return -EINVAL;
}

void r5l_exit_log(struct r5l_log *log)
{
	md_unregister_thread(&log->reclaim_thread);
	mempool_destroy(log->meta_pool);
	bioset_free(log->bs);
	mempool_destroy(log->io_pool);
	kmem_cache_destroy(log->io_kc);
	kfree(log);
}