row-iosched.c

/*
 * ROW (Read Over Write) I/O scheduler.
 *
 * Copyright (c) 2012-2014, The Linux Foundation. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 and
 * only version 2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 */

/* See Documentation/block/row-iosched.txt */

#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/blkdev.h>
#include <linux/elevator.h>
#include <linux/bio.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/compiler.h>
#include <linux/blktrace_api.h>
#include <linux/hrtimer.h>

/*
 * enum row_queue_prio - Priorities of the ROW queues
 *
 * This enum defines the priorities (and the number of queues)
 * the requests will be distributed to. The higher the priority,
 * the bigger the "bus time" (or dispatch quantum) given to that queue.
 * ROWQ_PRIO_HIGH_READ is the highest priority queue.
 *
 */
enum row_queue_prio {
	ROWQ_PRIO_HIGH_READ = 0,
	ROWQ_PRIO_HIGH_SWRITE,
	ROWQ_PRIO_REG_READ,
	ROWQ_PRIO_REG_SWRITE,
	ROWQ_PRIO_REG_WRITE,
	ROWQ_PRIO_LOW_READ,
	ROWQ_PRIO_LOW_SWRITE,
	ROWQ_MAX_PRIO,
};

/*
 * The following indexes define the distribution of ROW queues according to
 * priorities. Each index defines the first queue in that priority group.
 */
#define ROWQ_HIGH_PRIO_IDX	ROWQ_PRIO_HIGH_READ
#define ROWQ_REG_PRIO_IDX	ROWQ_PRIO_REG_READ
#define ROWQ_LOW_PRIO_IDX	ROWQ_PRIO_LOW_READ

/**
 * struct row_queue_params - ROW queue parameters
 * @idling_enabled:	Flag indicating whether idling is enabled on
 *			the queue
 * @quantum:		Number of requests to be dispatched from this queue
 *			in a dispatch cycle
 * @is_urgent:		Flag indicating whether the queue can notify on
 *			urgent requests
 *
 */
struct row_queue_params {
	bool idling_enabled;
	int quantum;
	bool is_urgent;
};

/*
 * This array holds the default values of the different configurables
 * for each ROW queue. Each row of the array holds the following values:
 * {idling_enabled, quantum, is_urgent}
 * Each row corresponds to a queue with the same index (according to
 * enum row_queue_prio).
 * Note: The quanta are meaningful only within their own priority class.
 * For example: for every 10 high priority read requests, 1 high priority
 * sync write will be dispatched; for every 100 regular read requests,
 * 1 regular write request will be dispatched.
 */
static const struct row_queue_params row_queues_def[] = {
/* idling_enabled, quantum, is_urgent */
	{true, 10, true},	/* ROWQ_PRIO_HIGH_READ */
	{false, 1, false},	/* ROWQ_PRIO_HIGH_SWRITE */
	{true, 100, true},	/* ROWQ_PRIO_REG_READ */
	{false, 1, false},	/* ROWQ_PRIO_REG_SWRITE */
	{false, 1, false},	/* ROWQ_PRIO_REG_WRITE */
	{false, 1, false},	/* ROWQ_PRIO_LOW_READ */
	{false, 1, false}	/* ROWQ_PRIO_LOW_SWRITE */
};

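/*
 * Illustrative sketch, not used by the scheduler (the helper name is ours):
 * summing the quanta of the queues in a priority class gives the maximum
 * number of requests that class can dispatch in one cycle before
 * row_restart_cycle() starts a new one. With the defaults above,
 * row_class_cycle_budget(ROWQ_REG_PRIO_IDX, ROWQ_LOW_PRIO_IDX) is
 * 100 + 1 + 1 = 102.
 */
static inline int __maybe_unused
row_class_cycle_budget(enum row_queue_prio first, enum row_queue_prio last)
{
	int i, budget = 0;

	for (i = first; i < last; i++)
		budget += row_queues_def[i].quantum;
	return budget;
}
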
/* Default values for idling on read queues (in msec) */
#define ROW_IDLE_TIME_MSEC 5
#define ROW_READ_FREQ_MSEC 5

/**
 * struct rowq_idling_data - parameters for idling on the queue
 * @last_insert_time:	time the last request was inserted
 *			to the queue
 * @begin_idling:	flag indicating whether we should idle
 *
 */
struct rowq_idling_data {
	ktime_t		last_insert_time;
	bool		begin_idling;
};

/**
 * struct row_queue - requests grouping structure
 * @rdata:		parent row_data structure
 * @fifo:		fifo of requests
 * @prio:		queue priority (enum row_queue_prio)
 * @nr_dispatched:	number of requests already dispatched in
 *			the current dispatch cycle
 * @nr_req:		number of requests in queue
 * @disp_quantum:	number of requests this queue may
 *			dispatch in a dispatch cycle
 * @idle_data:		data for idling on queues
 *
 */
struct row_queue {
	struct row_data		*rdata;
	struct list_head	fifo;
	enum row_queue_prio	prio;

	unsigned int		nr_dispatched;
	unsigned int		nr_req;
	int			disp_quantum;

	/* used only for READ queues */
	struct rowq_idling_data	idle_data;
};

/**
 * struct idling_data - data for idling on empty rqueue
 * @idle_time_ms:	idling duration (msec)
 * @freq_ms:		min time between two requests that
 *			trigger idling (msec)
 * @hr_timer:		idling timer
 * @idle_work:		the work to be scheduled when idling timer expires
 * @idling_queue_idx:	index of the queue we're idling on
 *
 */
struct idling_data {
	s64			idle_time_ms;
	s64			freq_ms;

	struct hrtimer		hr_timer;
	struct work_struct	idle_work;

	enum row_queue_prio	idling_queue_idx;
};

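/*
 * Read idling in a nutshell: row_add_request() sets begin_idling on a read
 * queue when two insertions arrive less than freq_ms apart. When that queue
 * is empty, row_get_ioprio_class_to_serve() arms hr_timer for idle_time_ms
 * instead of dispatching lower-priority work; when the timer expires,
 * row_idle_hrtimer_fn() schedules idle_work (kick_queue) to resume
 * dispatching.
 */
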
/**
 * struct starvation_data - data for starvation management
 * @starvation_limit:	number of times this priority class
 *			can tolerate being starved
 * @starvation_counter:	number of requests from higher
 *			priority classes that were dispatched while this
 *			priority's requests were pending
 *
 */
struct starvation_data {
	int		starvation_limit;
	int		starvation_counter;
};

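/*
 * Starvation management in short: each time a higher-class request is
 * dispatched while a lower-class request is pending, the lower class's
 * starvation_counter is bumped (see row_dispatch_insert()). Once it reaches
 * starvation_limit, row_get_ioprio_class_to_serve() serves that class even
 * though higher-priority work is still queued, and the counter is reset on
 * dispatch from that class. With the default limits below, a pending regular
 * request is served after at most 5000 high-priority dispatches, a low
 * request after at most 10000.
 */
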
/**
 * struct row_data - Per block device rqueue structure
 * @dispatch_queue:	dispatch rqueue
 * @row_queues:		array of priority request queues
 * @rd_idle_data:	data for idling after READ request
 * @nr_reqs:		nr_reqs[0] holds the number of all READ requests in
 *			scheduler, nr_reqs[1] holds the number of all WRITE
 *			requests in scheduler
 * @urgent_in_flight:	flag indicating that there is an urgent
 *			request that was dispatched to driver and is yet to
 *			complete.
 * @pending_urgent_rq:	pointer to the pending urgent request
 * @last_served_ioprio_class: I/O priority class that was last dispatched from
 * @reg_prio_starvation: starvation data for REGULAR priority queues
 * @low_prio_starvation: starvation data for LOW priority queues
 * @cycle_flags:	used for marking unserved queues
 *
 */
struct row_data {
	struct request_queue	*dispatch_queue;

	struct row_queue	row_queues[ROWQ_MAX_PRIO];

	struct idling_data	rd_idle_data;
	unsigned int		nr_reqs[2];
	bool			urgent_in_flight;
	struct request		*pending_urgent_rq;
	int			last_served_ioprio_class;

#define ROW_REG_STARVATION_TOLLERANCE	5000
	struct starvation_data	reg_prio_starvation;
#define ROW_LOW_STARVATION_TOLLERANCE	10000
	struct starvation_data	low_prio_starvation;

	unsigned int		cycle_flags;
};

#define RQ_ROWQ(rq) ((struct row_queue *) ((rq)->elv.priv[0]))

#define row_log(q, fmt, args...)   \
	blk_add_trace_msg(q, "%s():" fmt, __func__, ##args)
#define row_log_rowq(rdata, rowq_id, fmt, args...)		\
	blk_add_trace_msg(rdata->dispatch_queue, "rowq%d " fmt, \
		rowq_id, ##args)

static inline void row_mark_rowq_unserved(struct row_data *rd,
					  enum row_queue_prio qnum)
{
	rd->cycle_flags |= (1 << qnum);
}

static inline void row_clear_rowq_unserved(struct row_data *rd,
					   enum row_queue_prio qnum)
{
	rd->cycle_flags &= ~(1 << qnum);
}

static inline int row_rowq_unserved(struct row_data *rd,
				    enum row_queue_prio qnum)
{
	return rd->cycle_flags & (1 << qnum);
}

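/*
 * cycle_flags is a bitmask over the queues: bit i is set by
 * row_restart_cycle() when queue i did not use up its quantum in the cycle
 * that just ended, and is cleared again once the queue dispatches. A request
 * added to such an "unserved" queue may be promoted to urgent in
 * row_add_request().
 */
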
static inline void __maybe_unused row_dump_queues_stat(struct row_data *rd)
{
	int i;

	row_log(rd->dispatch_queue, " Queues status:");
	for (i = 0; i < ROWQ_MAX_PRIO; i++)
		row_log(rd->dispatch_queue,
			"queue%d: dispatched= %d, nr_req=%d", i,
			rd->row_queues[i].nr_dispatched,
			rd->row_queues[i].nr_req);
}

/******************** Static helper functions ***********************/
static void kick_queue(struct work_struct *work)
{
	struct idling_data *read_data =
		container_of(work, struct idling_data, idle_work);
	struct row_data *rd =
		container_of(read_data, struct row_data, rd_idle_data);

	blk_run_queue(rd->dispatch_queue);
}

static enum hrtimer_restart row_idle_hrtimer_fn(struct hrtimer *hr_timer)
{
	struct idling_data *read_data =
		container_of(hr_timer, struct idling_data, hr_timer);
	struct row_data *rd =
		container_of(read_data, struct row_data, rd_idle_data);

	row_log_rowq(rd, rd->rd_idle_data.idling_queue_idx,
			"Performing delayed work");
	/* Mark idling process as done */
	rd->row_queues[rd->rd_idle_data.idling_queue_idx].
			idle_data.begin_idling = false;
	rd->rd_idle_data.idling_queue_idx = ROWQ_MAX_PRIO;

	if (!rd->nr_reqs[READ] && !rd->nr_reqs[WRITE])
		row_log(rd->dispatch_queue, "No requests in scheduler");
	else
		kblockd_schedule_work(rd->dispatch_queue,
			&read_data->idle_work);
	return HRTIMER_NORESTART;
}

/*
 * row_regular_req_pending() - Check if there are REGULAR priority requests
 *				pending in scheduler
 * @rd:		pointer to struct row_data
 *
 * Returns true if there are REGULAR priority requests in scheduler queues,
 * false otherwise.
 */
static inline bool row_regular_req_pending(struct row_data *rd)
{
	int i;

	for (i = ROWQ_REG_PRIO_IDX; i < ROWQ_LOW_PRIO_IDX; i++)
		if (!list_empty(&rd->row_queues[i].fifo))
			return true;
	return false;
}

/*
 * row_low_req_pending() - Check if there are LOW priority requests
 *				pending in scheduler
 * @rd:		pointer to struct row_data
 *
 * Returns true if there are LOW priority requests in scheduler queues,
 * false otherwise.
 */
static inline bool row_low_req_pending(struct row_data *rd)
{
	int i;

	for (i = ROWQ_LOW_PRIO_IDX; i < ROWQ_MAX_PRIO; i++)
		if (!list_empty(&rd->row_queues[i].fifo))
			return true;
	return false;
}

/******************* Elevator callback functions *********************/

/*
 * row_add_request() - Add request to the scheduler
 * @q:	requests queue
 * @rq:	request to add
 *
 */
static void row_add_request(struct request_queue *q,
			    struct request *rq)
{
	struct row_data *rd = (struct row_data *)q->elevator->elevator_data;
	struct row_queue *rqueue = RQ_ROWQ(rq);
	s64 diff_ms;
	bool queue_was_empty = list_empty(&rqueue->fifo);

	list_add_tail(&rq->queuelist, &rqueue->fifo);
	rd->nr_reqs[rq_data_dir(rq)]++;
	rqueue->nr_req++;
	rq_set_fifo_time(rq, jiffies); /* for statistics */

	if (rq->cmd_flags & REQ_URGENT) {
		WARN_ON(1);
		blk_dump_rq_flags(rq, "");
		rq->cmd_flags &= ~REQ_URGENT;
	}

	if (row_queues_def[rqueue->prio].idling_enabled) {
		if (rd->rd_idle_data.idling_queue_idx == rqueue->prio &&
		    hrtimer_active(&rd->rd_idle_data.hr_timer)) {
			if (hrtimer_try_to_cancel(
				&rd->rd_idle_data.hr_timer) >= 0) {
				row_log_rowq(rd, rqueue->prio,
					"Canceled delayed work on %d",
					rd->rd_idle_data.idling_queue_idx);
				rd->rd_idle_data.idling_queue_idx =
					ROWQ_MAX_PRIO;
			}
		}
		diff_ms = ktime_to_ms(ktime_sub(ktime_get(),
				rqueue->idle_data.last_insert_time));
		if (unlikely(diff_ms < 0)) {
			pr_err("%s(): time delta error: diff_ms < 0",
				__func__);
			rqueue->idle_data.begin_idling = false;
			return;
		}
		if (diff_ms < rd->rd_idle_data.freq_ms) {
			rqueue->idle_data.begin_idling = true;
			row_log_rowq(rd, rqueue->prio, "Enable idling");
		} else {
			rqueue->idle_data.begin_idling = false;
			row_log_rowq(rd, rqueue->prio, "Disable idling (%ldms)",
				(long)diff_ms);
		}

		rqueue->idle_data.last_insert_time = ktime_get();
	}
	if (row_queues_def[rqueue->prio].is_urgent &&
	    !rd->pending_urgent_rq && !rd->urgent_in_flight) {
		/* Handle High Priority queues */
		if (rqueue->prio < ROWQ_REG_PRIO_IDX &&
		    rd->last_served_ioprio_class != IOPRIO_CLASS_RT &&
		    queue_was_empty) {
			row_log_rowq(rd, rqueue->prio,
				"added (high prio) urgent request");
			rq->cmd_flags |= REQ_URGENT;
			rd->pending_urgent_rq = rq;
		} else if (row_rowq_unserved(rd, rqueue->prio)) {
			/* Handle Regular priority queues */
			row_log_rowq(rd, rqueue->prio,
				"added urgent request (total on queue=%d)",
				rqueue->nr_req);
			rq->cmd_flags |= REQ_URGENT;
			rd->pending_urgent_rq = rq;
		}
	} else
		row_log_rowq(rd, rqueue->prio,
			"added request (total on queue=%d)", rqueue->nr_req);
}

/**
 * row_reinsert_req() - Reinsert request back to the scheduler
 * @q:	requests queue
 * @rq:	request to add
 *
 * Reinsert the given request back to the queue it was
 * dispatched from as if it was never dispatched.
 *
 * Returns 0 on success, error code otherwise
 */
static int row_reinsert_req(struct request_queue *q,
			    struct request *rq)
{
	struct row_data *rd = q->elevator->elevator_data;
	struct row_queue *rqueue = RQ_ROWQ(rq);

	if (!rqueue || rqueue->prio >= ROWQ_MAX_PRIO)
		return -EIO;

	list_add(&rq->queuelist, &rqueue->fifo);
	rd->nr_reqs[rq_data_dir(rq)]++;
	rqueue->nr_req++;

	row_log_rowq(rd, rqueue->prio,
		"%s request reinserted (total on queue=%d)",
		(rq_data_dir(rq) == READ ? "READ" : "WRITE"), rqueue->nr_req);

	if (rq->cmd_flags & REQ_URGENT) {
		/*
		 * It's not compliant with the design to re-insert
		 * urgent requests. We want to be able to track this
		 * down.
		 */
		WARN_ON(1);
		if (!rd->urgent_in_flight) {
			pr_err("%s(): no urgent in flight", __func__);
		} else {
			rd->urgent_in_flight = false;
			pr_err("%s(): reinserting URGENT %s req",
				__func__,
				(rq_data_dir(rq) == READ ? "READ" : "WRITE"));
			if (rd->pending_urgent_rq) {
				pr_err("%s(): urgent rq is pending",
					__func__);
				rd->pending_urgent_rq->cmd_flags &=
					~REQ_URGENT;
			}
			rd->pending_urgent_rq = rq;
		}
	}
	return 0;
}

static void row_completed_req(struct request_queue *q, struct request *rq)
{
	struct row_data *rd = q->elevator->elevator_data;
	/* remember the urgency before the flag is cleared below */
	bool was_urgent = rq->cmd_flags & REQ_URGENT;

	if (was_urgent) {
		if (!rd->urgent_in_flight) {
			WARN_ON(1);
			pr_err("%s(): URGENT req but urgent_in_flight = F",
				__func__);
		}
		rd->urgent_in_flight = false;
		rq->cmd_flags &= ~REQ_URGENT;
	}
	row_log(q, "completed %s %s req.",
		(was_urgent ? "URGENT" : "regular"),
		(rq_data_dir(rq) == READ ? "READ" : "WRITE"));
}

/**
 * row_urgent_pending() - Return TRUE if there is an urgent
 *			  request on scheduler
 * @q:	requests queue
 */
static bool row_urgent_pending(struct request_queue *q)
{
	struct row_data *rd = q->elevator->elevator_data;

	if (rd->urgent_in_flight) {
		row_log(rd->dispatch_queue, "%d urgent requests in flight",
			rd->urgent_in_flight);
		return false;
	}

	if (rd->pending_urgent_rq) {
		row_log(rd->dispatch_queue, "Urgent request pending");
		return true;
	}

	row_log(rd->dispatch_queue, "no urgent request pending/in flight");
	return false;
}

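/*
 * Urgent request flow: row_add_request() may mark a newly added request as
 * REQ_URGENT and remember it in pending_urgent_rq; row_urgent_pending()
 * reports it to the block layer (it is wired up as elevator_is_urgent_fn),
 * row_dispatch_requests() dispatches it ahead of the normal class selection,
 * and row_completed_req() clears urgent_in_flight when it finishes. Only one
 * urgent request is tracked at a time.
 */
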
/**
 * row_remove_request() - Remove given request from scheduler
 * @rd:	pointer to struct row_data
 * @rq:	request to remove
 *
 */
static void row_remove_request(struct row_data *rd,
			       struct request *rq)
{
	struct row_queue *rqueue = RQ_ROWQ(rq);

	list_del_init(&(rq)->queuelist);
	if (rd->pending_urgent_rq == rq)
		rd->pending_urgent_rq = NULL;
	else
		BUG_ON(rq->cmd_flags & REQ_URGENT);
	rqueue->nr_req--;
	rd->nr_reqs[rq_data_dir(rq)]--;
}

/*
 * row_dispatch_insert() - move request to dispatch queue
 * @rd:	pointer to struct row_data
 * @rq:	the request to dispatch
 *
 * This function moves the given request to the dispatch queue
 *
 */
static void row_dispatch_insert(struct row_data *rd, struct request *rq)
{
	struct row_queue *rqueue = RQ_ROWQ(rq);

	row_remove_request(rd, rq);
	elv_dispatch_sort(rd->dispatch_queue, rq);
	if (rq->cmd_flags & REQ_URGENT) {
		WARN_ON(rd->urgent_in_flight);
		rd->urgent_in_flight = true;
	}
	rqueue->nr_dispatched++;
	row_clear_rowq_unserved(rd, rqueue->prio);
	row_log_rowq(rd, rqueue->prio,
		" Dispatched request %p nr_disp = %d", rq,
		rqueue->nr_dispatched);
	if (rqueue->prio < ROWQ_REG_PRIO_IDX) {
		rd->last_served_ioprio_class = IOPRIO_CLASS_RT;
		if (row_regular_req_pending(rd))
			rd->reg_prio_starvation.starvation_counter++;
		if (row_low_req_pending(rd))
			rd->low_prio_starvation.starvation_counter++;
	} else if (rqueue->prio < ROWQ_LOW_PRIO_IDX) {
		rd->last_served_ioprio_class = IOPRIO_CLASS_BE;
		rd->reg_prio_starvation.starvation_counter = 0;
		if (row_low_req_pending(rd))
			rd->low_prio_starvation.starvation_counter++;
	} else {
		rd->last_served_ioprio_class = IOPRIO_CLASS_IDLE;
		rd->low_prio_starvation.starvation_counter = 0;
	}
}

/*
 * row_get_ioprio_class_to_serve() - Return the next I/O priority
 *				      class to dispatch requests from
 * @rd:	pointer to struct row_data
 * @force:	flag indicating if forced dispatch
 *
 * This function returns the next I/O priority class to serve
 * {IOPRIO_CLASS_NONE, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE}.
 * If there are no more requests in scheduler or if we're idling on some queue
 * IOPRIO_CLASS_NONE will be returned.
 * If idling is scheduled on a lower priority queue than the one that needs
 * to be served, it will be canceled.
 *
 */
static int row_get_ioprio_class_to_serve(struct row_data *rd, int force)
{
	int i;
	int ret = IOPRIO_CLASS_NONE;

	if (!rd->nr_reqs[READ] && !rd->nr_reqs[WRITE]) {
		row_log(rd->dispatch_queue, "No more requests in scheduler");
		goto check_idling;
	}

	/* First, go over the high priority queues */
	for (i = 0; i < ROWQ_REG_PRIO_IDX; i++) {
		if (!list_empty(&rd->row_queues[i].fifo)) {
			if (hrtimer_active(&rd->rd_idle_data.hr_timer)) {
				if (hrtimer_try_to_cancel(
					&rd->rd_idle_data.hr_timer) >= 0) {
					row_log(rd->dispatch_queue,
					"Canceling delayed work on %d. RT pending",
					rd->rd_idle_data.idling_queue_idx);
					rd->rd_idle_data.idling_queue_idx =
						ROWQ_MAX_PRIO;
				}
			}

			if (row_regular_req_pending(rd) &&
			    (rd->reg_prio_starvation.starvation_counter >=
			     rd->reg_prio_starvation.starvation_limit))
				ret = IOPRIO_CLASS_BE;
			else if (row_low_req_pending(rd) &&
			    (rd->low_prio_starvation.starvation_counter >=
			     rd->low_prio_starvation.starvation_limit))
				ret = IOPRIO_CLASS_IDLE;
			else
				ret = IOPRIO_CLASS_RT;

			goto done;
		}
	}

	/*
	 * At the moment idling is implemented only for READ queues.
	 * If enabled on WRITE, this needs updating
	 */
	if (hrtimer_active(&rd->rd_idle_data.hr_timer)) {
		row_log(rd->dispatch_queue, "Delayed work pending. Exiting");
		goto done;
	}
check_idling:
	/* Check for (high priority) idling and enable if needed */
	for (i = 0; i < ROWQ_REG_PRIO_IDX && !force; i++) {
		if (rd->row_queues[i].idle_data.begin_idling &&
		    row_queues_def[i].idling_enabled)
			goto initiate_idling;
	}

	/* Regular priority queues */
	for (i = ROWQ_REG_PRIO_IDX; i < ROWQ_LOW_PRIO_IDX; i++) {
		if (list_empty(&rd->row_queues[i].fifo)) {
			/* We can idle only if this is not a forced dispatch */
			if (rd->row_queues[i].idle_data.begin_idling &&
			    !force && row_queues_def[i].idling_enabled)
				goto initiate_idling;
		} else {
			if (row_low_req_pending(rd) &&
			    (rd->low_prio_starvation.starvation_counter >=
			     rd->low_prio_starvation.starvation_limit))
				ret = IOPRIO_CLASS_IDLE;
			else
				ret = IOPRIO_CLASS_BE;
			goto done;
		}
	}

	if (rd->nr_reqs[READ] || rd->nr_reqs[WRITE])
		ret = IOPRIO_CLASS_IDLE;
	goto done;

initiate_idling:
	hrtimer_start(&rd->rd_idle_data.hr_timer,
		ktime_set(0, rd->rd_idle_data.idle_time_ms * NSEC_PER_MSEC),
		HRTIMER_MODE_REL);

	rd->rd_idle_data.idling_queue_idx = i;
	row_log_rowq(rd, i, "Scheduled delayed work on %d. exiting", i);

done:
	return ret;
}

static void row_restart_cycle(struct row_data *rd,
				int start_idx, int end_idx)
{
	int i;

	row_dump_queues_stat(rd);

	for (i = start_idx; i < end_idx; i++) {
		if (rd->row_queues[i].nr_dispatched <
		    rd->row_queues[i].disp_quantum)
			row_mark_rowq_unserved(rd, i);
		rd->row_queues[i].nr_dispatched = 0;
	}

	row_log(rd->dispatch_queue, "Restarting cycle for class @ %d-%d",
		start_idx, end_idx);
}

/*
 * row_get_next_queue() - selects the next queue to dispatch from
 * @q:		requests queue
 * @rd:		pointer to struct row_data
 * @start_idx/end_idx: indexes in the row_queues array to select a queue
 *		from.
 *
 * Return the index of the queue to dispatch from, or an error code on
 * failure.
 *
 */
static int row_get_next_queue(struct request_queue *q, struct row_data *rd,
				int start_idx, int end_idx)
{
	int i = start_idx;
	bool restart = true;
	int ret = -EIO;

	do {
		if (list_empty(&rd->row_queues[i].fifo) ||
		    rd->row_queues[i].nr_dispatched >=
		    rd->row_queues[i].disp_quantum) {
			i++;
			if (i == end_idx && restart) {
				/* Restart cycle for this priority class */
				row_restart_cycle(rd, start_idx, end_idx);
				i = start_idx;
				restart = false;
			}
		} else {
			ret = i;
			break;
		}
	} while (i < end_idx);

	return ret;
}

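/*
 * Worked example (regular class, default quanta): with reads, sync writes
 * and writes all pending, row_get_next_queue() keeps returning REG_READ
 * until its 100-request quantum is used up, then REG_SWRITE (1 request),
 * then REG_WRITE (1 request); once every queue in the range is empty or has
 * exhausted its quantum, row_restart_cycle() zeroes the per-queue counters
 * and the round starts over.
 */
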
/*
 * row_dispatch_requests() - selects the next request to dispatch
 * @q:		requests queue
 * @force:	flag indicating if forced dispatch
 *
 * Return 0 if no requests were moved to the dispatch queue.
 *	  1 otherwise
 *
 */
static int row_dispatch_requests(struct request_queue *q, int force)
{
	struct row_data *rd = (struct row_data *)q->elevator->elevator_data;
	int ret = 0, currq, ioprio_class_to_serve, start_idx, end_idx;

	if (force && hrtimer_active(&rd->rd_idle_data.hr_timer)) {
		if (hrtimer_try_to_cancel(&rd->rd_idle_data.hr_timer) >= 0) {
			row_log(rd->dispatch_queue,
				"Canceled delayed work on %d - forced dispatch",
				rd->rd_idle_data.idling_queue_idx);
			rd->rd_idle_data.idling_queue_idx = ROWQ_MAX_PRIO;
		}
	}

	if (rd->pending_urgent_rq) {
		row_log(rd->dispatch_queue, "dispatching urgent request");
		row_dispatch_insert(rd, rd->pending_urgent_rq);
		ret = 1;
		goto done;
	}

	ioprio_class_to_serve = row_get_ioprio_class_to_serve(rd, force);
	row_log(rd->dispatch_queue, "Dispatching from %d priority class",
		ioprio_class_to_serve);

	switch (ioprio_class_to_serve) {
	case IOPRIO_CLASS_NONE:
		rd->last_served_ioprio_class = IOPRIO_CLASS_NONE;
		goto done;
	case IOPRIO_CLASS_RT:
		start_idx = ROWQ_HIGH_PRIO_IDX;
		end_idx = ROWQ_REG_PRIO_IDX;
		break;
	case IOPRIO_CLASS_BE:
		start_idx = ROWQ_REG_PRIO_IDX;
		end_idx = ROWQ_LOW_PRIO_IDX;
		break;
	case IOPRIO_CLASS_IDLE:
		start_idx = ROWQ_LOW_PRIO_IDX;
		end_idx = ROWQ_MAX_PRIO;
		break;
	default:
		pr_err("%s(): Invalid I/O priority class", __func__);
		goto done;
	}

	currq = row_get_next_queue(q, rd, start_idx, end_idx);

	/* Dispatch */
	if (currq >= 0) {
		row_dispatch_insert(rd,
			rq_entry_fifo(rd->row_queues[currq].fifo.next));
		ret = 1;
	}
done:
	return ret;
}

/*
 * row_init_queue() - Init scheduler data structures
 * @q:	requests queue
 *
 * Return pointer to struct row_data to be saved in elevator for
 * this dispatch queue
 *
 */
static void *row_init_queue(struct request_queue *q)
{
	struct row_data *rdata;
	int i;

	rdata = kmalloc_node(sizeof(*rdata),
			     GFP_KERNEL | __GFP_ZERO, q->node);
	if (!rdata)
		return NULL;

	memset(rdata, 0, sizeof(*rdata));
	for (i = 0; i < ROWQ_MAX_PRIO; i++) {
		INIT_LIST_HEAD(&rdata->row_queues[i].fifo);
		rdata->row_queues[i].disp_quantum = row_queues_def[i].quantum;
		rdata->row_queues[i].rdata = rdata;
		rdata->row_queues[i].prio = i;
		rdata->row_queues[i].idle_data.begin_idling = false;
		rdata->row_queues[i].idle_data.last_insert_time =
			ktime_set(0, 0);
	}

	rdata->reg_prio_starvation.starvation_limit =
			ROW_REG_STARVATION_TOLLERANCE;
	rdata->low_prio_starvation.starvation_limit =
			ROW_LOW_STARVATION_TOLLERANCE;
	/*
	 * Currently idling is enabled only for READ queues. If we want to
	 * enable it for write queues also, note that idling frequency will
	 * be the same in both cases
	 */
	rdata->rd_idle_data.idle_time_ms = ROW_IDLE_TIME_MSEC;
	rdata->rd_idle_data.freq_ms = ROW_READ_FREQ_MSEC;
	hrtimer_init(&rdata->rd_idle_data.hr_timer,
		CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	rdata->rd_idle_data.hr_timer.function = &row_idle_hrtimer_fn;

	INIT_WORK(&rdata->rd_idle_data.idle_work, kick_queue);
	rdata->last_served_ioprio_class = IOPRIO_CLASS_NONE;
	rdata->rd_idle_data.idling_queue_idx = ROWQ_MAX_PRIO;
	rdata->dispatch_queue = q;

	return rdata;
}

/*
 * row_exit_queue() - called on unloading the ROW scheduler
 * @e:	pointer to struct elevator_queue
 *
 */
static void row_exit_queue(struct elevator_queue *e)
{
	struct row_data *rd = (struct row_data *)e->elevator_data;
	int i;

	for (i = 0; i < ROWQ_MAX_PRIO; i++)
		BUG_ON(!list_empty(&rd->row_queues[i].fifo));
	if (hrtimer_cancel(&rd->rd_idle_data.hr_timer))
		pr_err("%s(): idle timer was active!", __func__);
	rd->rd_idle_data.idling_queue_idx = ROWQ_MAX_PRIO;
	kfree(rd);
}

/*
 * row_merged_requests() - Called when 2 requests are merged
 * @q:		requests queue
 * @rq:		request the two requests were merged into
 * @next:	request that was merged
 */
static void row_merged_requests(struct request_queue *q, struct request *rq,
				struct request *next)
{
	struct row_queue *rqueue = RQ_ROWQ(next);

	list_del_init(&next->queuelist);
	rqueue->nr_req--;
	if (rqueue->rdata->pending_urgent_rq == next) {
		pr_err("\n\nROW_WARNING: merging pending urgent!");
		rqueue->rdata->pending_urgent_rq = rq;
		rq->cmd_flags |= REQ_URGENT;
		WARN_ON(!(next->cmd_flags & REQ_URGENT));
		next->cmd_flags &= ~REQ_URGENT;
	}
	rqueue->rdata->nr_reqs[rq_data_dir(rq)]--;
}

/*
 * row_get_queue_prio() - Get queue priority for a given request
 *
 * This is a helper function whose purpose is to determine which
 * ROW queue the given request should be added to (and
 * dispatched from later on)
 *
 */
static enum row_queue_prio row_get_queue_prio(struct request *rq,
				struct row_data *rd)
{
	const int data_dir = rq_data_dir(rq);
	const bool is_sync = rq_is_sync(rq);
	enum row_queue_prio q_type = ROWQ_MAX_PRIO;
	int ioprio_class = IOPRIO_PRIO_CLASS(rq->elv.icq->ioc->ioprio);

	switch (ioprio_class) {
	case IOPRIO_CLASS_RT:
		if (data_dir == READ)
			q_type = ROWQ_PRIO_HIGH_READ;
		else if (is_sync)
			q_type = ROWQ_PRIO_HIGH_SWRITE;
		else {
			pr_err("%s:%s(): got a simple write from RT_CLASS. How???",
				rq->rq_disk->disk_name, __func__);
			q_type = ROWQ_PRIO_REG_WRITE;
		}
		break;
	case IOPRIO_CLASS_IDLE:
		if (data_dir == READ)
			q_type = ROWQ_PRIO_LOW_READ;
		else if (is_sync)
			q_type = ROWQ_PRIO_LOW_SWRITE;
		else {
			pr_err("%s:%s(): got a simple write from IDLE_CLASS. How???",
				rq->rq_disk->disk_name, __func__);
			q_type = ROWQ_PRIO_REG_WRITE;
		}
		break;
	case IOPRIO_CLASS_NONE:
	case IOPRIO_CLASS_BE:
	default:
		if (data_dir == READ)
			q_type = ROWQ_PRIO_REG_READ;
		else if (is_sync)
			q_type = ROWQ_PRIO_REG_SWRITE;
		else
			q_type = ROWQ_PRIO_REG_WRITE;
		break;
	}

	return q_type;
}

/*
 * row_set_request() - Set ROW data structures associated with this request.
 * @q:		requests queue
 * @rq:		pointer to the request
 * @gfp_mask:	ignored
 *
 */
static int
row_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask)
{
	struct row_data *rd = (struct row_data *)q->elevator->elevator_data;
	unsigned long flags;

	spin_lock_irqsave(q->queue_lock, flags);
	rq->elv.priv[0] =
		(void *)(&rd->row_queues[row_get_queue_prio(rq, rd)]);
	spin_unlock_irqrestore(q->queue_lock, flags);

	return 0;
}

/********** Helping sysfs functions/definitions for ROW attributes ******/
static ssize_t row_var_show(int var, char *page)
{
	return snprintf(page, 100, "%d\n", var);
}

static ssize_t row_var_store(int *var, const char *page, size_t count)
{
	int err;

	/*
	 * Parse directly into the int; the previous cast of an int pointer
	 * to unsigned long * could overrun the variable on 64-bit.
	 */
	err = kstrtoint(page, 10, var);
	if (err)
		return err;

	return count;
}

#define SHOW_FUNCTION(__FUNC, __VAR)					\
static ssize_t __FUNC(struct elevator_queue *e, char *page)		\
{									\
	struct row_data *rowd = e->elevator_data;			\
	int __data = __VAR;						\
	return row_var_show(__data, (page));				\
}
SHOW_FUNCTION(row_hp_read_quantum_show,
	rowd->row_queues[ROWQ_PRIO_HIGH_READ].disp_quantum);
SHOW_FUNCTION(row_rp_read_quantum_show,
	rowd->row_queues[ROWQ_PRIO_REG_READ].disp_quantum);
SHOW_FUNCTION(row_hp_swrite_quantum_show,
	rowd->row_queues[ROWQ_PRIO_HIGH_SWRITE].disp_quantum);
SHOW_FUNCTION(row_rp_swrite_quantum_show,
	rowd->row_queues[ROWQ_PRIO_REG_SWRITE].disp_quantum);
SHOW_FUNCTION(row_rp_write_quantum_show,
	rowd->row_queues[ROWQ_PRIO_REG_WRITE].disp_quantum);
SHOW_FUNCTION(row_lp_read_quantum_show,
	rowd->row_queues[ROWQ_PRIO_LOW_READ].disp_quantum);
SHOW_FUNCTION(row_lp_swrite_quantum_show,
	rowd->row_queues[ROWQ_PRIO_LOW_SWRITE].disp_quantum);
SHOW_FUNCTION(row_rd_idle_data_show, rowd->rd_idle_data.idle_time_ms);
SHOW_FUNCTION(row_rd_idle_data_freq_show, rowd->rd_idle_data.freq_ms);
SHOW_FUNCTION(row_reg_starv_limit_show,
	rowd->reg_prio_starvation.starvation_limit);
SHOW_FUNCTION(row_low_starv_limit_show,
	rowd->low_prio_starvation.starvation_limit);
#undef SHOW_FUNCTION

#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX)				\
static ssize_t __FUNC(struct elevator_queue *e,				\
		const char *page, size_t count)				\
{									\
	struct row_data *rowd = e->elevator_data;			\
	int __data;							\
	int ret = row_var_store(&__data, (page), count);		\
	if (ret < 0)							\
		return ret;						\
	if (__data < (MIN))						\
		__data = (MIN);						\
	else if (__data > (MAX))					\
		__data = (MAX);						\
	*(__PTR) = __data;						\
	return ret;							\
}
STORE_FUNCTION(row_hp_read_quantum_store,
	&rowd->row_queues[ROWQ_PRIO_HIGH_READ].disp_quantum, 1, INT_MAX);
STORE_FUNCTION(row_rp_read_quantum_store,
	&rowd->row_queues[ROWQ_PRIO_REG_READ].disp_quantum,
	1, INT_MAX);
STORE_FUNCTION(row_hp_swrite_quantum_store,
	&rowd->row_queues[ROWQ_PRIO_HIGH_SWRITE].disp_quantum,
	1, INT_MAX);
STORE_FUNCTION(row_rp_swrite_quantum_store,
	&rowd->row_queues[ROWQ_PRIO_REG_SWRITE].disp_quantum,
	1, INT_MAX);
STORE_FUNCTION(row_rp_write_quantum_store,
	&rowd->row_queues[ROWQ_PRIO_REG_WRITE].disp_quantum,
	1, INT_MAX);
STORE_FUNCTION(row_lp_read_quantum_store,
	&rowd->row_queues[ROWQ_PRIO_LOW_READ].disp_quantum,
	1, INT_MAX);
STORE_FUNCTION(row_lp_swrite_quantum_store,
	&rowd->row_queues[ROWQ_PRIO_LOW_SWRITE].disp_quantum,
	1, INT_MAX);
STORE_FUNCTION(row_rd_idle_data_store, &rowd->rd_idle_data.idle_time_ms,
	1, INT_MAX);
STORE_FUNCTION(row_rd_idle_data_freq_store, &rowd->rd_idle_data.freq_ms,
	1, INT_MAX);
STORE_FUNCTION(row_reg_starv_limit_store,
	&rowd->reg_prio_starvation.starvation_limit,
	1, INT_MAX);
STORE_FUNCTION(row_low_starv_limit_store,
	&rowd->low_prio_starvation.starvation_limit,
	1, INT_MAX);
#undef STORE_FUNCTION

#define ROW_ATTR(name) \
	__ATTR(name, S_IRUGO|S_IWUSR, row_##name##_show, \
				      row_##name##_store)

static struct elv_fs_entry row_attrs[] = {
	ROW_ATTR(hp_read_quantum),
	ROW_ATTR(rp_read_quantum),
	ROW_ATTR(hp_swrite_quantum),
	ROW_ATTR(rp_swrite_quantum),
	ROW_ATTR(rp_write_quantum),
	ROW_ATTR(lp_read_quantum),
	ROW_ATTR(lp_swrite_quantum),
	ROW_ATTR(rd_idle_data),
	ROW_ATTR(rd_idle_data_freq),
	ROW_ATTR(reg_starv_limit),
	ROW_ATTR(low_starv_limit),
	__ATTR_NULL
};

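/*
 * Usage sketch (paths assume the standard sysfs layout for elevator
 * attributes): once "row" is the active scheduler for a device, e.g. via
 * "echo row > /sys/block/<dev>/queue/scheduler", the attributes above are
 * exposed under /sys/block/<dev>/queue/iosched/, so the regular read quantum
 * can be read or tuned through
 * /sys/block/<dev>/queue/iosched/rp_read_quantum.
 */
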
static struct elevator_type iosched_row = {
	.ops = {
		.elevator_merge_req_fn		= row_merged_requests,
		.elevator_dispatch_fn		= row_dispatch_requests,
		.elevator_add_req_fn		= row_add_request,
		.elevator_reinsert_req_fn	= row_reinsert_req,
		.elevator_is_urgent_fn		= row_urgent_pending,
		.elevator_completed_req_fn	= row_completed_req,
		.elevator_former_req_fn		= elv_rb_former_request,
		.elevator_latter_req_fn		= elv_rb_latter_request,
		.elevator_set_req_fn		= row_set_request,
		.elevator_init_fn		= row_init_queue,
		.elevator_exit_fn		= row_exit_queue,
	},
	.icq_size = sizeof(struct io_cq),
	.icq_align = __alignof__(struct io_cq),
	.elevator_attrs = row_attrs,
	.elevator_name = "row",
	.elevator_owner = THIS_MODULE,
};

static int __init row_init(void)
{
	elv_register(&iosched_row);
	return 0;
}

static void __exit row_exit(void)
{
	elv_unregister(&iosched_row);
}

module_init(row_init);
module_exit(row_exit);

MODULE_LICENSE("GPL v2");
MODULE_DESCRIPTION("Read Over Write IO scheduler");