kyber-iosched.c

/*
 * The Kyber I/O scheduler. Controls latency by throttling queue depths using
 * scalable techniques.
 *
 * Copyright (C) 2017 Facebook
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
 */

#include <linux/kernel.h>
#include <linux/blkdev.h>
#include <linux/blk-mq.h>
#include <linux/elevator.h>
#include <linux/module.h>
#include <linux/sbitmap.h>

#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-debugfs.h"
#include "blk-mq-sched.h"
#include "blk-mq-tag.h"
#include "blk-stat.h"

/* Scheduling domains. */
enum {
	KYBER_READ,
	KYBER_SYNC_WRITE,
	KYBER_OTHER, /* Async writes, discard, etc. */
	KYBER_NUM_DOMAINS,
};

enum {
	KYBER_MIN_DEPTH = 256,

	/*
	 * In order to prevent starvation of synchronous requests by a flood of
	 * asynchronous requests, we reserve 25% of requests for synchronous
	 * operations.
	 */
	KYBER_ASYNC_PERCENT = 75,
};
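
/*
 * Worked example (illustrative numbers): if the scheduler tag sbitmap uses
 * 64-bit words (shift == 6), kyber_queue_data_alloc() computes
 * async_depth = 64 * KYBER_ASYNC_PERCENT / 100 = 48, so kyber_limit_depth()
 * lets asynchronous requests use at most 48 of the 64 tags in each sbitmap
 * word via the shallow-depth allocation path.
 */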

/*
 * Initial device-wide depths for each scheduling domain.
 *
 * Even for fast devices with lots of tags like NVMe, you can saturate
 * the device with only a fraction of the maximum possible queue depth.
 * So, we cap these to a reasonable value.
 */
static const unsigned int kyber_depth[] = {
	[KYBER_READ] = 256,
	[KYBER_SYNC_WRITE] = 128,
	[KYBER_OTHER] = 64,
};

/*
 * Scheduling domain batch sizes. We favor reads.
 */
static const unsigned int kyber_batch_size[] = {
	[KYBER_READ] = 16,
	[KYBER_SYNC_WRITE] = 8,
	[KYBER_OTHER] = 8,
};

struct kyber_queue_data {
	struct request_queue *q;

	struct blk_stat_callback *cb;

	/*
	 * The device is divided into multiple scheduling domains based on the
	 * request type. Each domain has a fixed number of in-flight requests of
	 * that type device-wide, limited by these tokens.
	 */
	struct sbitmap_queue domain_tokens[KYBER_NUM_DOMAINS];

	/*
	 * Async request percentage, converted to per-word depth for
	 * sbitmap_get_shallow().
	 */
	unsigned int async_depth;

	/* Target latencies in nanoseconds. */
	u64 read_lat_nsec, write_lat_nsec;
};

struct kyber_hctx_data {
	spinlock_t lock;
	struct list_head rqs[KYBER_NUM_DOMAINS];
	unsigned int cur_domain;
	unsigned int batching;
	wait_queue_entry_t domain_wait[KYBER_NUM_DOMAINS];
	atomic_t wait_index[KYBER_NUM_DOMAINS];
};

static int rq_sched_domain(const struct request *rq)
{
	unsigned int op = rq->cmd_flags;

	if ((op & REQ_OP_MASK) == REQ_OP_READ)
		return KYBER_READ;
	else if ((op & REQ_OP_MASK) == REQ_OP_WRITE && op_is_sync(op))
		return KYBER_SYNC_WRITE;
	else
		return KYBER_OTHER;
}

enum {
	NONE = 0,
	GOOD = 1,
	GREAT = 2,
	BAD = -1,
	AWFUL = -2,
};

#define IS_GOOD(status) ((status) > 0)
#define IS_BAD(status) ((status) < 0)

static int kyber_lat_status(struct blk_stat_callback *cb,
			    unsigned int sched_domain, u64 target)
{
	u64 latency;

	if (!cb->stat[sched_domain].nr_samples)
		return NONE;

	latency = cb->stat[sched_domain].mean;
	if (latency >= 2 * target)
		return AWFUL;
	else if (latency > target)
		return BAD;
	else if (latency <= target / 2)
		return GREAT;
	else /* (latency <= target) */
		return GOOD;
}
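
/*
 * Example of kyber_lat_status() with the default 2 ms read target
 * (kqd->read_lat_nsec): a mean read latency of 5 ms is AWFUL (>= 2 * target),
 * 3 ms is BAD, 1.5 ms is GOOD, and 0.8 ms is GREAT (<= target / 2).
 */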

/*
 * Adjust the read or synchronous write depth given the status of reads and
 * writes. The goal is that the latencies of the two domains are fair (i.e., if
 * one is good, then the other is good).
 */
static void kyber_adjust_rw_depth(struct kyber_queue_data *kqd,
				  unsigned int sched_domain, int this_status,
				  int other_status)
{
	unsigned int orig_depth, depth;

	/*
	 * If this domain had no samples, or reads and writes are both good or
	 * both bad, don't adjust the depth.
	 */
	if (this_status == NONE ||
	    (IS_GOOD(this_status) && IS_GOOD(other_status)) ||
	    (IS_BAD(this_status) && IS_BAD(other_status)))
		return;

	orig_depth = depth = kqd->domain_tokens[sched_domain].sb.depth;

	if (other_status == NONE) {
		depth++;
	} else {
		switch (this_status) {
		case GOOD:
			if (other_status == AWFUL)
				depth -= max(depth / 4, 1U);
			else
				depth -= max(depth / 8, 1U);
			break;
		case GREAT:
			if (other_status == AWFUL)
				depth /= 2;
			else
				depth -= max(depth / 4, 1U);
			break;
		case BAD:
			depth++;
			break;
		case AWFUL:
			if (other_status == GREAT)
				depth += 2;
			else
				depth++;
			break;
		}
	}

	depth = clamp(depth, 1U, kyber_depth[sched_domain]);
	if (depth != orig_depth)
		sbitmap_queue_resize(&kqd->domain_tokens[sched_domain], depth);
}
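
/*
 * For instance, when kyber_stat_timer_fn() sees reads GREAT and sync writes
 * AWFUL, the KYBER_READ call above halves the read depth while the
 * KYBER_SYNC_WRITE call bumps the sync write depth by 2, shifting device
 * capacity toward the domain that is missing its target.
 */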

/*
 * Adjust the depth of other requests given the status of reads and synchronous
 * writes. As long as either domain is doing fine, we don't throttle, but if
 * both domains are doing badly, we throttle heavily.
 */
static void kyber_adjust_other_depth(struct kyber_queue_data *kqd,
				     int read_status, int write_status,
				     bool have_samples)
{
	unsigned int orig_depth, depth;
	int status;

	orig_depth = depth = kqd->domain_tokens[KYBER_OTHER].sb.depth;

	if (read_status == NONE && write_status == NONE) {
		depth += 2;
	} else if (have_samples) {
		if (read_status == NONE)
			status = write_status;
		else if (write_status == NONE)
			status = read_status;
		else
			status = max(read_status, write_status);
		switch (status) {
		case GREAT:
			depth += 2;
			break;
		case GOOD:
			depth++;
			break;
		case BAD:
			depth -= max(depth / 4, 1U);
			break;
		case AWFUL:
			depth /= 2;
			break;
		}
	}

	depth = clamp(depth, 1U, kyber_depth[KYBER_OTHER]);
	if (depth != orig_depth)
		sbitmap_queue_resize(&kqd->domain_tokens[KYBER_OTHER], depth);
}
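
/*
 * For example, if reads are BAD but sync writes are GREAT, the status used
 * here is max(BAD, GREAT) == GREAT, so the KYBER_OTHER depth still grows by
 * 2; it only shrinks once the better of the two latency-sensitive domains is
 * BAD or AWFUL.
 */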

/*
 * Apply heuristics for limiting queue depths based on gathered latency
 * statistics.
 */
static void kyber_stat_timer_fn(struct blk_stat_callback *cb)
{
	struct kyber_queue_data *kqd = cb->data;
	int read_status, write_status;

	read_status = kyber_lat_status(cb, KYBER_READ, kqd->read_lat_nsec);
	write_status = kyber_lat_status(cb, KYBER_SYNC_WRITE,
					kqd->write_lat_nsec);

	kyber_adjust_rw_depth(kqd, KYBER_READ, read_status, write_status);
	kyber_adjust_rw_depth(kqd, KYBER_SYNC_WRITE, write_status, read_status);
	kyber_adjust_other_depth(kqd, read_status, write_status,
				 cb->stat[KYBER_OTHER].nr_samples != 0);

	/*
	 * Continue monitoring latencies if we aren't hitting the targets or
	 * we're still throttling other requests.
	 */
	if (!blk_stat_is_active(kqd->cb) &&
	    ((IS_BAD(read_status) || IS_BAD(write_status) ||
	      kqd->domain_tokens[KYBER_OTHER].sb.depth < kyber_depth[KYBER_OTHER])))
		blk_stat_activate_msecs(kqd->cb, 100);
}

static unsigned int kyber_sched_tags_shift(struct kyber_queue_data *kqd)
{
	/*
	 * All of the hardware queues have the same depth, so we can just grab
	 * the shift of the first one.
	 */
	return kqd->q->queue_hw_ctx[0]->sched_tags->bitmap_tags.sb.shift;
}

static struct kyber_queue_data *kyber_queue_data_alloc(struct request_queue *q)
{
	struct kyber_queue_data *kqd;
	unsigned int max_tokens;
	unsigned int shift;
	int ret = -ENOMEM;
	int i;

	kqd = kmalloc_node(sizeof(*kqd), GFP_KERNEL, q->node);
	if (!kqd)
		goto err;
	kqd->q = q;

	kqd->cb = blk_stat_alloc_callback(kyber_stat_timer_fn, rq_sched_domain,
					  KYBER_NUM_DOMAINS, kqd);
	if (!kqd->cb)
		goto err_kqd;

	/*
	 * The maximum number of tokens for any scheduling domain is at least
	 * the queue depth of a single hardware queue. If the hardware doesn't
	 * have many tags, still provide a reasonable number.
	 */
	max_tokens = max_t(unsigned int, q->tag_set->queue_depth,
			   KYBER_MIN_DEPTH);
	for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
		WARN_ON(!kyber_depth[i]);
		WARN_ON(!kyber_batch_size[i]);
		ret = sbitmap_queue_init_node(&kqd->domain_tokens[i],
					      max_tokens, -1, false, GFP_KERNEL,
					      q->node);
		if (ret) {
			while (--i >= 0)
				sbitmap_queue_free(&kqd->domain_tokens[i]);
			goto err_cb;
		}
		sbitmap_queue_resize(&kqd->domain_tokens[i], kyber_depth[i]);
	}

	shift = kyber_sched_tags_shift(kqd);
	kqd->async_depth = (1U << shift) * KYBER_ASYNC_PERCENT / 100U;

	kqd->read_lat_nsec = 2000000ULL;
	kqd->write_lat_nsec = 10000000ULL;

	return kqd;

err_cb:
	blk_stat_free_callback(kqd->cb);
err_kqd:
	kfree(kqd);
err:
	return ERR_PTR(ret);
}

static int kyber_init_sched(struct request_queue *q, struct elevator_type *e)
{
	struct kyber_queue_data *kqd;
	struct elevator_queue *eq;

	eq = elevator_alloc(q, e);
	if (!eq)
		return -ENOMEM;

	kqd = kyber_queue_data_alloc(q);
	if (IS_ERR(kqd)) {
		kobject_put(&eq->kobj);
		return PTR_ERR(kqd);
	}

	eq->elevator_data = kqd;
	q->elevator = eq;

	blk_stat_add_callback(q, kqd->cb);

	return 0;
}

static void kyber_exit_sched(struct elevator_queue *e)
{
	struct kyber_queue_data *kqd = e->elevator_data;
	struct request_queue *q = kqd->q;
	int i;

	blk_stat_remove_callback(q, kqd->cb);

	for (i = 0; i < KYBER_NUM_DOMAINS; i++)
		sbitmap_queue_free(&kqd->domain_tokens[i]);
	blk_stat_free_callback(kqd->cb);
	kfree(kqd);
}

static int kyber_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
{
	struct kyber_hctx_data *khd;
	int i;

	khd = kmalloc_node(sizeof(*khd), GFP_KERNEL, hctx->numa_node);
	if (!khd)
		return -ENOMEM;

	spin_lock_init(&khd->lock);

	for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
		INIT_LIST_HEAD(&khd->rqs[i]);
		INIT_LIST_HEAD(&khd->domain_wait[i].entry);
		atomic_set(&khd->wait_index[i], 0);
	}

	khd->cur_domain = 0;
	khd->batching = 0;

	hctx->sched_data = khd;

	return 0;
}

static void kyber_exit_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
{
	kfree(hctx->sched_data);
}
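
/*
 * A request's domain token is stashed in rq->elv.priv[0];
 * kyber_prepare_request() sets it to -1 to mean "no token assigned yet", and
 * kyber_finish_request() hands it back to the sbitmap pool through
 * rq_clear_domain_token().
 */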

static int rq_get_domain_token(struct request *rq)
{
	return (long)rq->elv.priv[0];
}

static void rq_set_domain_token(struct request *rq, int token)
{
	rq->elv.priv[0] = (void *)(long)token;
}

static void rq_clear_domain_token(struct kyber_queue_data *kqd,
				  struct request *rq)
{
	unsigned int sched_domain;
	int nr;

	nr = rq_get_domain_token(rq);
	if (nr != -1) {
		sched_domain = rq_sched_domain(rq);
		sbitmap_queue_clear(&kqd->domain_tokens[sched_domain], nr,
				    rq->mq_ctx->cpu);
	}
}

static void kyber_limit_depth(unsigned int op, struct blk_mq_alloc_data *data)
{
	/*
	 * We use the scheduler tags as per-hardware queue queueing tokens.
	 * Async requests can be limited at this stage.
	 */
	if (!op_is_sync(op)) {
		struct kyber_queue_data *kqd = data->q->elevator->elevator_data;

		data->shallow_depth = kqd->async_depth;
	}
}

static void kyber_prepare_request(struct request *rq, struct bio *bio)
{
	rq_set_domain_token(rq, -1);
}

static void kyber_finish_request(struct request *rq)
{
	struct kyber_queue_data *kqd = rq->q->elevator->elevator_data;

	rq_clear_domain_token(kqd, rq);
}

static void kyber_completed_request(struct request *rq)
{
	struct request_queue *q = rq->q;
	struct kyber_queue_data *kqd = q->elevator->elevator_data;
	unsigned int sched_domain;
	u64 now, latency, target;

	/*
	 * Check if this request met our latency goal. If not, quickly gather
	 * some statistics and start throttling.
	 */
	sched_domain = rq_sched_domain(rq);
	switch (sched_domain) {
	case KYBER_READ:
		target = kqd->read_lat_nsec;
		break;
	case KYBER_SYNC_WRITE:
		target = kqd->write_lat_nsec;
		break;
	default:
		return;
	}

	/* If we are already monitoring latencies, don't check again. */
	if (blk_stat_is_active(kqd->cb))
		return;

	now = __blk_stat_time(ktime_to_ns(ktime_get()));
	if (now < blk_stat_time(&rq->issue_stat))
		return;

	latency = now - blk_stat_time(&rq->issue_stat);

	if (latency > target)
		blk_stat_activate_msecs(kqd->cb, 10);
}
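
/*
 * Note that a missed target here only arms a short 10 ms statistics window
 * via blk_stat_activate_msecs(); the actual throttling decisions happen in
 * kyber_stat_timer_fn() when that window expires, and that callback may
 * re-arm itself for 100 ms while depths are still being limited.
 */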

static void kyber_flush_busy_ctxs(struct kyber_hctx_data *khd,
				  struct blk_mq_hw_ctx *hctx)
{
	LIST_HEAD(rq_list);
	struct request *rq, *next;

	blk_mq_flush_busy_ctxs(hctx, &rq_list);
	list_for_each_entry_safe(rq, next, &rq_list, queuelist) {
		unsigned int sched_domain;

		sched_domain = rq_sched_domain(rq);
		list_move_tail(&rq->queuelist, &khd->rqs[sched_domain]);
	}
}

static int kyber_domain_wake(wait_queue_entry_t *wait, unsigned mode, int flags,
			     void *key)
{
	struct blk_mq_hw_ctx *hctx = READ_ONCE(wait->private);

	list_del_init(&wait->entry);
	blk_mq_run_hw_queue(hctx, true);
	return 1;
}

static int kyber_get_domain_token(struct kyber_queue_data *kqd,
				  struct kyber_hctx_data *khd,
				  struct blk_mq_hw_ctx *hctx)
{
	unsigned int sched_domain = khd->cur_domain;
	struct sbitmap_queue *domain_tokens = &kqd->domain_tokens[sched_domain];
	wait_queue_entry_t *wait = &khd->domain_wait[sched_domain];
	struct sbq_wait_state *ws;
	int nr;

	nr = __sbitmap_queue_get(domain_tokens);
	if (nr >= 0)
		return nr;

	/*
	 * If we failed to get a domain token, make sure the hardware queue is
	 * run when one becomes available. Note that this is serialized on
	 * khd->lock, but we still need to be careful about the waker.
	 */
	if (list_empty_careful(&wait->entry)) {
		init_waitqueue_func_entry(wait, kyber_domain_wake);
		wait->private = hctx;
		ws = sbq_wait_ptr(domain_tokens,
				  &khd->wait_index[sched_domain]);
		add_wait_queue(&ws->wait, wait);

		/*
		 * Try again in case a token was freed before we got on the
		 * wait queue.
		 */
		nr = __sbitmap_queue_get(domain_tokens);
	}
	return nr;
}

static struct request *
kyber_dispatch_cur_domain(struct kyber_queue_data *kqd,
			  struct kyber_hctx_data *khd,
			  struct blk_mq_hw_ctx *hctx,
			  bool *flushed)
{
	struct list_head *rqs;
	struct request *rq;
	int nr;

	rqs = &khd->rqs[khd->cur_domain];
	rq = list_first_entry_or_null(rqs, struct request, queuelist);

	/*
	 * If there wasn't already a pending request and we haven't flushed the
	 * software queues yet, flush the software queues and check again.
	 */
	if (!rq && !*flushed) {
		kyber_flush_busy_ctxs(khd, hctx);
		*flushed = true;
		rq = list_first_entry_or_null(rqs, struct request, queuelist);
	}

	if (rq) {
		nr = kyber_get_domain_token(kqd, khd, hctx);
		if (nr >= 0) {
			khd->batching++;
			rq_set_domain_token(rq, nr);
			list_del_init(&rq->queuelist);
			return rq;
		}
	}

	/* There were either no pending requests or no tokens. */
	return NULL;
}

static struct request *kyber_dispatch_request(struct blk_mq_hw_ctx *hctx)
{
	struct kyber_queue_data *kqd = hctx->queue->elevator->elevator_data;
	struct kyber_hctx_data *khd = hctx->sched_data;
	bool flushed = false;
	struct request *rq;
	int i;

	spin_lock(&khd->lock);

	/*
	 * First, if we are still entitled to batch, try to dispatch a request
	 * from the batch.
	 */
	if (khd->batching < kyber_batch_size[khd->cur_domain]) {
		rq = kyber_dispatch_cur_domain(kqd, khd, hctx, &flushed);
		if (rq)
			goto out;
	}

	/*
	 * Either,
	 * 1. We were no longer entitled to a batch.
	 * 2. The domain we were batching didn't have any requests.
	 * 3. The domain we were batching was out of tokens.
	 *
	 * Start another batch. Note that this wraps back around to the original
	 * domain if no other domains have requests or tokens.
	 */
	khd->batching = 0;
	for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
		if (khd->cur_domain == KYBER_NUM_DOMAINS - 1)
			khd->cur_domain = 0;
		else
			khd->cur_domain++;

		rq = kyber_dispatch_cur_domain(kqd, khd, hctx, &flushed);
		if (rq)
			goto out;
	}

	rq = NULL;
out:
	spin_unlock(&khd->lock);
	return rq;
}
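
/*
 * With the default kyber_batch_size values, a hardware queue hands out up to
 * 16 consecutive reads (or 8 sync writes / 8 others) before rotating to the
 * next domain that has both queued requests and a free domain token.
 */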

static bool kyber_has_work(struct blk_mq_hw_ctx *hctx)
{
	struct kyber_hctx_data *khd = hctx->sched_data;
	int i;

	for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
		if (!list_empty_careful(&khd->rqs[i]))
			return true;
	}
	return false;
}

#define KYBER_LAT_SHOW_STORE(op) \
static ssize_t kyber_##op##_lat_show(struct elevator_queue *e, \
				     char *page) \
{ \
	struct kyber_queue_data *kqd = e->elevator_data; \
 \
	return sprintf(page, "%llu\n", kqd->op##_lat_nsec); \
} \
 \
static ssize_t kyber_##op##_lat_store(struct elevator_queue *e, \
				      const char *page, size_t count) \
{ \
	struct kyber_queue_data *kqd = e->elevator_data; \
	unsigned long long nsec; \
	int ret; \
 \
	ret = kstrtoull(page, 10, &nsec); \
	if (ret) \
		return ret; \
 \
	kqd->op##_lat_nsec = nsec; \
 \
	return count; \
}
KYBER_LAT_SHOW_STORE(read);
KYBER_LAT_SHOW_STORE(write);
#undef KYBER_LAT_SHOW_STORE

#define KYBER_LAT_ATTR(op) __ATTR(op##_lat_nsec, 0644, kyber_##op##_lat_show, kyber_##op##_lat_store)
static struct elv_fs_entry kyber_sched_attrs[] = {
	KYBER_LAT_ATTR(read),
	KYBER_LAT_ATTR(write),
	__ATTR_NULL
};
#undef KYBER_LAT_ATTR
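
/*
 * These two tunables are exposed through the elevator's sysfs directory. For
 * example, with a device named nvme0n1 (the device name is illustrative):
 *
 *   cat /sys/block/nvme0n1/queue/iosched/read_lat_nsec
 *   echo 5000000 > /sys/block/nvme0n1/queue/iosched/write_lat_nsec
 *
 * Both values are in nanoseconds.
 */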

#ifdef CONFIG_BLK_DEBUG_FS
#define KYBER_DEBUGFS_DOMAIN_ATTRS(domain, name) \
static int kyber_##name##_tokens_show(void *data, struct seq_file *m) \
{ \
	struct request_queue *q = data; \
	struct kyber_queue_data *kqd = q->elevator->elevator_data; \
 \
	sbitmap_queue_show(&kqd->domain_tokens[domain], m); \
	return 0; \
} \
 \
static void *kyber_##name##_rqs_start(struct seq_file *m, loff_t *pos) \
	__acquires(&khd->lock) \
{ \
	struct blk_mq_hw_ctx *hctx = m->private; \
	struct kyber_hctx_data *khd = hctx->sched_data; \
 \
	spin_lock(&khd->lock); \
	return seq_list_start(&khd->rqs[domain], *pos); \
} \
 \
static void *kyber_##name##_rqs_next(struct seq_file *m, void *v, \
				     loff_t *pos) \
{ \
	struct blk_mq_hw_ctx *hctx = m->private; \
	struct kyber_hctx_data *khd = hctx->sched_data; \
 \
	return seq_list_next(v, &khd->rqs[domain], pos); \
} \
 \
static void kyber_##name##_rqs_stop(struct seq_file *m, void *v) \
	__releases(&khd->lock) \
{ \
	struct blk_mq_hw_ctx *hctx = m->private; \
	struct kyber_hctx_data *khd = hctx->sched_data; \
 \
	spin_unlock(&khd->lock); \
} \
 \
static const struct seq_operations kyber_##name##_rqs_seq_ops = { \
	.start	= kyber_##name##_rqs_start, \
	.next	= kyber_##name##_rqs_next, \
	.stop	= kyber_##name##_rqs_stop, \
	.show	= blk_mq_debugfs_rq_show, \
}; \
 \
static int kyber_##name##_waiting_show(void *data, struct seq_file *m) \
{ \
	struct blk_mq_hw_ctx *hctx = data; \
	struct kyber_hctx_data *khd = hctx->sched_data; \
	wait_queue_entry_t *wait = &khd->domain_wait[domain]; \
 \
	seq_printf(m, "%d\n", !list_empty_careful(&wait->entry)); \
	return 0; \
}
KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_READ, read)
KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_SYNC_WRITE, sync_write)
KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_OTHER, other)
#undef KYBER_DEBUGFS_DOMAIN_ATTRS

static int kyber_async_depth_show(void *data, struct seq_file *m)
{
	struct request_queue *q = data;
	struct kyber_queue_data *kqd = q->elevator->elevator_data;

	seq_printf(m, "%u\n", kqd->async_depth);
	return 0;
}

static int kyber_cur_domain_show(void *data, struct seq_file *m)
{
	struct blk_mq_hw_ctx *hctx = data;
	struct kyber_hctx_data *khd = hctx->sched_data;

	switch (khd->cur_domain) {
	case KYBER_READ:
		seq_puts(m, "READ\n");
		break;
	case KYBER_SYNC_WRITE:
		seq_puts(m, "SYNC_WRITE\n");
		break;
	case KYBER_OTHER:
		seq_puts(m, "OTHER\n");
		break;
	default:
		seq_printf(m, "%u\n", khd->cur_domain);
		break;
	}
	return 0;
}

static int kyber_batching_show(void *data, struct seq_file *m)
{
	struct blk_mq_hw_ctx *hctx = data;
	struct kyber_hctx_data *khd = hctx->sched_data;

	seq_printf(m, "%u\n", khd->batching);
	return 0;
}

#define KYBER_QUEUE_DOMAIN_ATTRS(name) \
	{#name "_tokens", 0400, kyber_##name##_tokens_show}
static const struct blk_mq_debugfs_attr kyber_queue_debugfs_attrs[] = {
	KYBER_QUEUE_DOMAIN_ATTRS(read),
	KYBER_QUEUE_DOMAIN_ATTRS(sync_write),
	KYBER_QUEUE_DOMAIN_ATTRS(other),
	{"async_depth", 0400, kyber_async_depth_show},
	{},
};
#undef KYBER_QUEUE_DOMAIN_ATTRS

#define KYBER_HCTX_DOMAIN_ATTRS(name) \
	{#name "_rqs", 0400, .seq_ops = &kyber_##name##_rqs_seq_ops}, \
	{#name "_waiting", 0400, kyber_##name##_waiting_show}
static const struct blk_mq_debugfs_attr kyber_hctx_debugfs_attrs[] = {
	KYBER_HCTX_DOMAIN_ATTRS(read),
	KYBER_HCTX_DOMAIN_ATTRS(sync_write),
	KYBER_HCTX_DOMAIN_ATTRS(other),
	{"cur_domain", 0400, kyber_cur_domain_show},
	{"batching", 0400, kyber_batching_show},
	{},
};
#undef KYBER_HCTX_DOMAIN_ATTRS
#endif
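
/*
 * With CONFIG_BLK_DEBUG_FS enabled, the attributes above show up under
 * debugfs (commonly mounted at /sys/kernel/debug): the per-queue files in
 * block/<dev>/sched/ and the per-hctx files in block/<dev>/hctx<N>/sched/.
 */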

static struct elevator_type kyber_sched = {
	.ops.mq = {
		.init_sched = kyber_init_sched,
		.exit_sched = kyber_exit_sched,
		.init_hctx = kyber_init_hctx,
		.exit_hctx = kyber_exit_hctx,
		.limit_depth = kyber_limit_depth,
		.prepare_request = kyber_prepare_request,
		.finish_request = kyber_finish_request,
		.requeue_request = kyber_finish_request,
		.completed_request = kyber_completed_request,
		.dispatch_request = kyber_dispatch_request,
		.has_work = kyber_has_work,
	},
	.uses_mq = true,
#ifdef CONFIG_BLK_DEBUG_FS
	.queue_debugfs_attrs = kyber_queue_debugfs_attrs,
	.hctx_debugfs_attrs = kyber_hctx_debugfs_attrs,
#endif
	.elevator_attrs = kyber_sched_attrs,
	.elevator_name = "kyber",
	.elevator_owner = THIS_MODULE,
};

static int __init kyber_init(void)
{
	return elv_register(&kyber_sched);
}

static void __exit kyber_exit(void)
{
	elv_unregister(&kyber_sched);
}

module_init(kyber_init);
module_exit(kyber_exit);

MODULE_AUTHOR("Omar Sandoval");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Kyber I/O scheduler");
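
/*
 * Usage sketch: with CONFIG_MQ_IOSCHED_KYBER built in or loaded as a module,
 * the scheduler can be selected per device through the standard blk-mq
 * scheduler switch, e.g. (the device name is illustrative):
 *
 *   echo kyber > /sys/block/nvme0n1/queue/scheduler
 */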