blkback.c

/******************************************************************************
 *
 * Back-end of the driver for virtual block devices. This portion of the
 * driver exports a 'unified' block-device interface that can be accessed
 * by any operating system that implements a compatible front end. A
 * reference front-end implementation can be found in:
 *  drivers/block/xen-blkfront.c
 *
 * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
 * Copyright (c) 2005, Christopher Clark
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License version 2
 * as published by the Free Software Foundation; or, when distributed
 * separately from the Linux kernel or incorporated into other
 * software packages, subject to the following license:
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this source file (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use, copy, modify,
 * merge, publish, distribute, sublicense, and/or sell copies of the Software,
 * and to permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */
#define pr_fmt(fmt) "xen-blkback: " fmt

#include <linux/spinlock.h>
#include <linux/kthread.h>
#include <linux/list.h>
#include <linux/delay.h>
#include <linux/freezer.h>
#include <linux/bitmap.h>

#include <xen/events.h>
#include <xen/page.h>
#include <xen/xen.h>
#include <asm/xen/hypervisor.h>
#include <asm/xen/hypercall.h>
#include <xen/balloon.h>
#include <xen/grant_table.h>
#include "common.h"
/*
 * Maximum number of unused free pages to keep in the internal buffer.
 * Setting this to a value too low will reduce memory used in each backend,
 * but can have a performance penalty.
 *
 * A sane value is xen_blkif_reqs * BLKIF_MAX_SEGMENTS_PER_REQUEST, but can
 * be set to a lower value that might degrade performance on some intensive
 * IO workloads.
 */

static int xen_blkif_max_buffer_pages = 1024;
module_param_named(max_buffer_pages, xen_blkif_max_buffer_pages, int, 0644);
MODULE_PARM_DESC(max_buffer_pages,
"Maximum number of free pages to keep in each block backend buffer");
/*
 * Maximum number of grants to map persistently in blkback. For maximum
 * performance this should be the total number of grants that can be used
 * to fill the ring, but since this might become too high, especially with
 * the use of indirect descriptors, we set it to a value that provides good
 * performance without using too much memory.
 *
 * When the list of persistent grants is full we clean it up using an LRU
 * algorithm.
 */

static int xen_blkif_max_pgrants = 1056;
module_param_named(max_persistent_grants, xen_blkif_max_pgrants, int, 0644);
MODULE_PARM_DESC(max_persistent_grants,
                 "Maximum number of grants to map persistently");
/*
 * Maximum number of rings/queues blkback supports, allow as many queues as
 * there are CPUs if the user has not specified a value.
 */
unsigned int xenblk_max_queues;
module_param_named(max_queues, xenblk_max_queues, uint, 0644);
MODULE_PARM_DESC(max_queues,
                 "Maximum number of hardware queues per virtual disk. "
                 "By default it is the number of online CPUs.");

/*
 * Maximum order of pages to be used for the shared ring between front and
 * backend, 4KB page granularity is used.
 */
unsigned int xen_blkif_max_ring_order = XENBUS_MAX_RING_GRANT_ORDER;
module_param_named(max_ring_page_order, xen_blkif_max_ring_order, int, S_IRUGO);
MODULE_PARM_DESC(max_ring_page_order, "Maximum order of pages to be used for the shared ring");
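/*
 * The parameters above can also be set at module load time, for example
 * (hypothetical invocation, values chosen arbitrarily):
 *
 *   modprobe xen-blkback max_buffer_pages=2048 max_persistent_grants=1344
 *
 * The 0644 parameters are additionally run-time switchable through their
 * nodes under /sys/module/.../parameters/ (see the note before log_stats
 * below); max_ring_page_order is read-only at run time (S_IRUGO).
 */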
/*
 * The LRU mechanism to clean the lists of persistent grants needs to
 * be executed periodically. The time interval between consecutive executions
 * of the purge mechanism is set in ms.
 */
#define LRU_INTERVAL 100

/*
 * When the persistent grants list is full we will remove unused grants
 * from the list. The percentage of grants to be removed at each LRU
 * execution.
 */
#define LRU_PERCENT_CLEAN 5
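/*
 * Worked example with the defaults above: xen_blkif_max_pgrants is 1056,
 * so a single purge pass tries to reclaim (1056 / 100) * 5 = 50 unused
 * grants, plus any grants currently held above the limit (which can happen
 * if max_persistent_grants is lowered at run time). See
 * purge_persistent_gnt() below.
 */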
/* Run-time switchable: /sys/module/blkback/parameters/ */
static unsigned int log_stats;
module_param(log_stats, int, 0644);

#define BLKBACK_INVALID_HANDLE (~0)

/* Number of free pages to remove on each call to gnttab_free_pages */
#define NUM_BATCH_FREE_PAGES 10
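/*
 * Each ring keeps a pool of pages (ring->free_pages) so grant mappings do
 * not have to allocate and free pages on every request. get_free_page()
 * hands out a page from the pool (falling back to gnttab_alloc_pages()
 * when the pool is empty), put_free_pages() returns pages to it, and
 * shrink_free_pagepool() releases everything above a given threshold back
 * to the system in batches of NUM_BATCH_FREE_PAGES.
 */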
static inline int get_free_page(struct xen_blkif_ring *ring, struct page **page)
{
        unsigned long flags;

        spin_lock_irqsave(&ring->free_pages_lock, flags);
        if (list_empty(&ring->free_pages)) {
                BUG_ON(ring->free_pages_num != 0);
                spin_unlock_irqrestore(&ring->free_pages_lock, flags);
                return gnttab_alloc_pages(1, page);
        }
        BUG_ON(ring->free_pages_num == 0);
        page[0] = list_first_entry(&ring->free_pages, struct page, lru);
        list_del(&page[0]->lru);
        ring->free_pages_num--;
        spin_unlock_irqrestore(&ring->free_pages_lock, flags);

        return 0;
}

static inline void put_free_pages(struct xen_blkif_ring *ring, struct page **page,
                                  int num)
{
        unsigned long flags;
        int i;

        spin_lock_irqsave(&ring->free_pages_lock, flags);
        for (i = 0; i < num; i++)
                list_add(&page[i]->lru, &ring->free_pages);
        ring->free_pages_num += num;
        spin_unlock_irqrestore(&ring->free_pages_lock, flags);
}

static inline void shrink_free_pagepool(struct xen_blkif_ring *ring, int num)
{
        /* Remove requested pages in batches of NUM_BATCH_FREE_PAGES */
        struct page *page[NUM_BATCH_FREE_PAGES];
        unsigned int num_pages = 0;
        unsigned long flags;

        spin_lock_irqsave(&ring->free_pages_lock, flags);
        while (ring->free_pages_num > num) {
                BUG_ON(list_empty(&ring->free_pages));
                page[num_pages] = list_first_entry(&ring->free_pages,
                                                   struct page, lru);
                list_del(&page[num_pages]->lru);
                ring->free_pages_num--;
                if (++num_pages == NUM_BATCH_FREE_PAGES) {
                        spin_unlock_irqrestore(&ring->free_pages_lock, flags);
                        gnttab_free_pages(num_pages, page);
                        spin_lock_irqsave(&ring->free_pages_lock, flags);
                        num_pages = 0;
                }
        }
        spin_unlock_irqrestore(&ring->free_pages_lock, flags);
        if (num_pages != 0)
                gnttab_free_pages(num_pages, page);
}

#define vaddr(page) ((unsigned long)pfn_to_kaddr(page_to_pfn(page)))

static int do_block_io_op(struct xen_blkif_ring *ring);
static int dispatch_rw_block_io(struct xen_blkif_ring *ring,
                                struct blkif_request *req,
                                struct pending_req *pending_req);
static void make_response(struct xen_blkif_ring *ring, u64 id,
                          unsigned short op, int st);

#define foreach_grant_safe(pos, n, rbtree, node) \
        for ((pos) = container_of(rb_first((rbtree)), typeof(*(pos)), node), \
             (n) = (&(pos)->node != NULL) ? rb_next(&(pos)->node) : NULL; \
             &(pos)->node != NULL; \
             (pos) = container_of(n, typeof(*(pos)), node), \
             (n) = (&(pos)->node != NULL) ? rb_next(&(pos)->node) : NULL)
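/*
 * foreach_grant_safe() walks the rb-tree of persistent grants in key order
 * while caching the next node in 'n' before the loop body runs, so the
 * current entry may be rb_erase()d and kfree()d inside the loop without
 * breaking the traversal (analogous to list_for_each_entry_safe()).
 */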
/*
 * We don't need locking around the persistent grant helpers
 * because blkback uses a single thread for each backend, so we
 * can be sure that these functions will never be called recursively.
 *
 * The only exception to that is put_persistent_grant, which can be called
 * from interrupt context (by xen_blkbk_unmap), so we have to use atomic
 * bit operations to modify the flags of a persistent grant and to count
 * the number of used grants.
 */
static int add_persistent_gnt(struct xen_blkif_ring *ring,
                              struct persistent_gnt *persistent_gnt)
{
        struct rb_node **new = NULL, *parent = NULL;
        struct persistent_gnt *this;
        struct xen_blkif *blkif = ring->blkif;

        if (ring->persistent_gnt_c >= xen_blkif_max_pgrants) {
                if (!blkif->vbd.overflow_max_grants)
                        blkif->vbd.overflow_max_grants = 1;
                return -EBUSY;
        }

        /* Figure out where to put new node */
        new = &ring->persistent_gnts.rb_node;
        while (*new) {
                this = container_of(*new, struct persistent_gnt, node);

                parent = *new;
                if (persistent_gnt->gnt < this->gnt)
                        new = &((*new)->rb_left);
                else if (persistent_gnt->gnt > this->gnt)
                        new = &((*new)->rb_right);
                else {
                        pr_alert_ratelimited("trying to add a gref that's already in the tree\n");
                        return -EINVAL;
                }
        }

        bitmap_zero(persistent_gnt->flags, PERSISTENT_GNT_FLAGS_SIZE);
        set_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags);
        /* Add new node and rebalance tree. */
        rb_link_node(&(persistent_gnt->node), parent, new);
        rb_insert_color(&(persistent_gnt->node), &ring->persistent_gnts);
        ring->persistent_gnt_c++;
        atomic_inc(&ring->persistent_gnt_in_use);
        return 0;
}

static struct persistent_gnt *get_persistent_gnt(struct xen_blkif_ring *ring,
                                                 grant_ref_t gref)
{
        struct persistent_gnt *data;
        struct rb_node *node = NULL;

        node = ring->persistent_gnts.rb_node;
        while (node) {
                data = container_of(node, struct persistent_gnt, node);

                if (gref < data->gnt)
                        node = node->rb_left;
                else if (gref > data->gnt)
                        node = node->rb_right;
                else {
                        if (test_bit(PERSISTENT_GNT_ACTIVE, data->flags)) {
                                pr_alert_ratelimited("requesting a grant already in use\n");
                                return NULL;
                        }
                        set_bit(PERSISTENT_GNT_ACTIVE, data->flags);
                        atomic_inc(&ring->persistent_gnt_in_use);
                        return data;
                }
        }
        return NULL;
}

static void put_persistent_gnt(struct xen_blkif_ring *ring,
                               struct persistent_gnt *persistent_gnt)
{
        if (!test_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags))
                pr_alert_ratelimited("freeing a grant already unused\n");
        set_bit(PERSISTENT_GNT_WAS_ACTIVE, persistent_gnt->flags);
        clear_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags);
        atomic_dec(&ring->persistent_gnt_in_use);
}
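/*
 * Flag lifecycle of a persistent grant: PERSISTENT_GNT_ACTIVE is set while
 * a request is using the grant (set in add_persistent_gnt() and
 * get_persistent_gnt(), cleared in put_persistent_gnt()), while
 * PERSISTENT_GNT_WAS_ACTIVE records that the grant was used at some point
 * since the last LRU pass, so purge_persistent_gnt() prefers grants that
 * have been completely idle.
 */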
static void free_persistent_gnts(struct xen_blkif_ring *ring, struct rb_root *root,
                                 unsigned int num)
{
        struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST];
        struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST];
        struct persistent_gnt *persistent_gnt;
        struct rb_node *n;
        int segs_to_unmap = 0;
        struct gntab_unmap_queue_data unmap_data;

        unmap_data.pages = pages;
        unmap_data.unmap_ops = unmap;
        unmap_data.kunmap_ops = NULL;

        foreach_grant_safe(persistent_gnt, n, root, node) {
                BUG_ON(persistent_gnt->handle ==
                       BLKBACK_INVALID_HANDLE);
                gnttab_set_unmap_op(&unmap[segs_to_unmap],
                                    (unsigned long) pfn_to_kaddr(page_to_pfn(
                                            persistent_gnt->page)),
                                    GNTMAP_host_map,
                                    persistent_gnt->handle);

                pages[segs_to_unmap] = persistent_gnt->page;

                if (++segs_to_unmap == BLKIF_MAX_SEGMENTS_PER_REQUEST ||
                    !rb_next(&persistent_gnt->node)) {
                        unmap_data.count = segs_to_unmap;
                        BUG_ON(gnttab_unmap_refs_sync(&unmap_data));

                        put_free_pages(ring, pages, segs_to_unmap);
                        segs_to_unmap = 0;
                }

                rb_erase(&persistent_gnt->node, root);
                kfree(persistent_gnt);
                num--;
        }
        BUG_ON(num != 0);
}

void xen_blkbk_unmap_purged_grants(struct work_struct *work)
{
        struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST];
        struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST];
        struct persistent_gnt *persistent_gnt;
        int segs_to_unmap = 0;
        struct xen_blkif_ring *ring = container_of(work, typeof(*ring), persistent_purge_work);
        struct gntab_unmap_queue_data unmap_data;

        unmap_data.pages = pages;
        unmap_data.unmap_ops = unmap;
        unmap_data.kunmap_ops = NULL;

        while (!list_empty(&ring->persistent_purge_list)) {
                persistent_gnt = list_first_entry(&ring->persistent_purge_list,
                                                  struct persistent_gnt,
                                                  remove_node);
                list_del(&persistent_gnt->remove_node);

                gnttab_set_unmap_op(&unmap[segs_to_unmap],
                                    vaddr(persistent_gnt->page),
                                    GNTMAP_host_map,
                                    persistent_gnt->handle);

                pages[segs_to_unmap] = persistent_gnt->page;

                if (++segs_to_unmap == BLKIF_MAX_SEGMENTS_PER_REQUEST) {
                        unmap_data.count = segs_to_unmap;
                        BUG_ON(gnttab_unmap_refs_sync(&unmap_data));
                        put_free_pages(ring, pages, segs_to_unmap);
                        segs_to_unmap = 0;
                }
                kfree(persistent_gnt);
        }
        if (segs_to_unmap > 0) {
                unmap_data.count = segs_to_unmap;
                BUG_ON(gnttab_unmap_refs_sync(&unmap_data));
                put_free_pages(ring, pages, segs_to_unmap);
        }
}
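/*
 * Trim the tree of persistent grants down to a sane size. The scan below
 * makes up to three passes over the tree: the first pass only evicts
 * grants that are neither ACTIVE nor WAS_ACTIVE; if that is not enough, a
 * second pass (scan_used) also evicts WAS_ACTIVE grants; a final pass
 * (clean_used) clears the WAS_ACTIVE bit on everything that stays, so the
 * next purge starts from a clean slate. The evicted grants are handed to
 * xen_blkbk_unmap_purged_grants() via persistent_purge_list and unmapped
 * from a workqueue.
 */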
static void purge_persistent_gnt(struct xen_blkif_ring *ring)
{
        struct persistent_gnt *persistent_gnt;
        struct rb_node *n;
        unsigned int num_clean, total;
        bool scan_used = false, clean_used = false;
        struct rb_root *root;

        if (ring->persistent_gnt_c < xen_blkif_max_pgrants ||
            (ring->persistent_gnt_c == xen_blkif_max_pgrants &&
            !ring->blkif->vbd.overflow_max_grants)) {
                goto out;
        }

        if (work_busy(&ring->persistent_purge_work)) {
                pr_alert_ratelimited("Scheduled work from previous purge is still busy, cannot purge list\n");
                goto out;
        }

        num_clean = (xen_blkif_max_pgrants / 100) * LRU_PERCENT_CLEAN;
        num_clean = ring->persistent_gnt_c - xen_blkif_max_pgrants + num_clean;
        num_clean = min(ring->persistent_gnt_c, num_clean);
        if ((num_clean == 0) ||
            (num_clean > (ring->persistent_gnt_c - atomic_read(&ring->persistent_gnt_in_use))))
                goto out;

        /*
         * At this point, we can be sure that there will be no calls
         * to get_persistent_grant (because we are executing this code from
         * xen_blkif_schedule), there can only be calls to put_persistent_gnt,
         * which means that the number of currently used grants will go down,
         * but never up, so we will always be able to remove the requested
         * number of grants.
         */

        total = num_clean;

        pr_debug("Going to purge %u persistent grants\n", num_clean);

        BUG_ON(!list_empty(&ring->persistent_purge_list));
        root = &ring->persistent_gnts;
purge_list:
        foreach_grant_safe(persistent_gnt, n, root, node) {
                BUG_ON(persistent_gnt->handle ==
                       BLKBACK_INVALID_HANDLE);

                if (clean_used) {
                        clear_bit(PERSISTENT_GNT_WAS_ACTIVE, persistent_gnt->flags);
                        continue;
                }

                if (test_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags))
                        continue;
                if (!scan_used &&
                    (test_bit(PERSISTENT_GNT_WAS_ACTIVE, persistent_gnt->flags)))
                        continue;

                rb_erase(&persistent_gnt->node, root);
                list_add(&persistent_gnt->remove_node,
                         &ring->persistent_purge_list);
                if (--num_clean == 0)
                        goto finished;
        }
        /*
         * If we get here it means we also need to start cleaning
         * grants that were used since the last purge in order to cope
         * with the requested number.
         */
        if (!scan_used && !clean_used) {
                pr_debug("Still missing %u purged frames\n", num_clean);
                scan_used = true;
                goto purge_list;
        }
finished:
        if (!clean_used) {
                pr_debug("Finished scanning for grants to clean, removing used flag\n");
                clean_used = true;
                goto purge_list;
        }

        ring->persistent_gnt_c -= (total - num_clean);
        ring->blkif->vbd.overflow_max_grants = 0;

        /* We can defer this work */
        schedule_work(&ring->persistent_purge_work);
        pr_debug("Purged %u/%u\n", (total - num_clean), total);

out:
        return;
}
/*
 * Retrieve from the 'pending_reqs' a free pending_req structure to be used.
 */
static struct pending_req *alloc_req(struct xen_blkif_ring *ring)
{
        struct pending_req *req = NULL;
        unsigned long flags;

        spin_lock_irqsave(&ring->pending_free_lock, flags);
        if (!list_empty(&ring->pending_free)) {
                req = list_entry(ring->pending_free.next, struct pending_req,
                                 free_list);
                list_del(&req->free_list);
        }
        spin_unlock_irqrestore(&ring->pending_free_lock, flags);
        return req;
}

/*
 * Return the 'pending_req' structure back to the free pool. We also
 * wake up the thread if it was waiting for a free pending_req.
 */
static void free_req(struct xen_blkif_ring *ring, struct pending_req *req)
{
        unsigned long flags;
        int was_empty;

        spin_lock_irqsave(&ring->pending_free_lock, flags);
        was_empty = list_empty(&ring->pending_free);
        list_add(&req->free_list, &ring->pending_free);
        spin_unlock_irqrestore(&ring->pending_free_lock, flags);
        if (was_empty)
                wake_up(&ring->pending_free_wq);
}
/*
 * Routines for managing virtual block devices (vbds).
 */
static int xen_vbd_translate(struct phys_req *req, struct xen_blkif *blkif,
                             int operation)
{
        struct xen_vbd *vbd = &blkif->vbd;
        int rc = -EACCES;

        if ((operation != REQ_OP_READ) && vbd->readonly)
                goto out;

        if (likely(req->nr_sects)) {
                blkif_sector_t end = req->sector_number + req->nr_sects;

                if (unlikely(end < req->sector_number))
                        goto out;
                if (unlikely(end > vbd_sz(vbd)))
                        goto out;
        }

        req->dev  = vbd->pdevice;
        req->bdev = vbd->bdev;
        rc = 0;

out:
        return rc;
}
static void xen_vbd_resize(struct xen_blkif *blkif)
{
        struct xen_vbd *vbd = &blkif->vbd;
        struct xenbus_transaction xbt;
        int err;
        struct xenbus_device *dev = xen_blkbk_xenbus(blkif->be);
        unsigned long long new_size = vbd_sz(vbd);

        pr_info("VBD Resize: Domid: %d, Device: (%d, %d)\n",
                blkif->domid, MAJOR(vbd->pdevice), MINOR(vbd->pdevice));
        pr_info("VBD Resize: new size %llu\n", new_size);
        vbd->size = new_size;
again:
        err = xenbus_transaction_start(&xbt);
        if (err) {
                pr_warn("Error starting transaction\n");
                return;
        }
        err = xenbus_printf(xbt, dev->nodename, "sectors", "%llu",
                            (unsigned long long)vbd_sz(vbd));
        if (err) {
                pr_warn("Error writing new size\n");
                goto abort;
        }
        /*
         * Write the current state; we will use this to synchronize
         * the front-end. If the current state is "connected" the
         * front-end will get the new size information online.
         */
        err = xenbus_printf(xbt, dev->nodename, "state", "%d", dev->state);
        if (err) {
                pr_warn("Error writing the state\n");
                goto abort;
        }

        err = xenbus_transaction_end(xbt, 0);
        if (err == -EAGAIN)
                goto again;
        if (err)
                pr_warn("Error ending transaction\n");
        return;
abort:
        xenbus_transaction_end(xbt, 1);
}
/*
 * Notification from the guest OS.
 */
static void blkif_notify_work(struct xen_blkif_ring *ring)
{
        ring->waiting_reqs = 1;
        wake_up(&ring->wq);
}

irqreturn_t xen_blkif_be_int(int irq, void *dev_id)
{
        blkif_notify_work(dev_id);
        return IRQ_HANDLED;
}

/*
 * SCHEDULER FUNCTIONS
 */

static void print_stats(struct xen_blkif_ring *ring)
{
        pr_info("(%s): oo %3llu | rd %4llu | wr %4llu | f %4llu"
                " | ds %4llu | pg: %4u/%4d\n",
                current->comm, ring->st_oo_req,
                ring->st_rd_req, ring->st_wr_req,
                ring->st_f_req, ring->st_ds_req,
                ring->persistent_gnt_c,
                xen_blkif_max_pgrants);
        ring->st_print = jiffies + msecs_to_jiffies(10 * 1000);
        ring->st_rd_req = 0;
        ring->st_wr_req = 0;
        ring->st_oo_req = 0;
        ring->st_ds_req = 0;
}
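/*
 * Per-ring kernel thread. Roughly: sleep until the frontend kicks us or a
 * pending_req is freed, consume requests from the shared ring via
 * do_block_io_op(), purge stale persistent grants every LRU_INTERVAL ms,
 * and trim the free-page pool down to xen_blkif_max_buffer_pages. When
 * log_stats is set, the counters above are printed every 10 seconds.
 */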
int xen_blkif_schedule(void *arg)
{
        struct xen_blkif_ring *ring = arg;
        struct xen_blkif *blkif = ring->blkif;
        struct xen_vbd *vbd = &blkif->vbd;
        unsigned long timeout;
        int ret;

        xen_blkif_get(blkif);
        set_freezable();
        while (!kthread_should_stop()) {
                if (try_to_freeze())
                        continue;
                if (unlikely(vbd->size != vbd_sz(vbd)))
                        xen_vbd_resize(blkif);

                timeout = msecs_to_jiffies(LRU_INTERVAL);

                timeout = wait_event_interruptible_timeout(
                        ring->wq,
                        ring->waiting_reqs || kthread_should_stop(),
                        timeout);
                if (timeout == 0)
                        goto purge_gnt_list;
                timeout = wait_event_interruptible_timeout(
                        ring->pending_free_wq,
                        !list_empty(&ring->pending_free) ||
                        kthread_should_stop(),
                        timeout);
                if (timeout == 0)
                        goto purge_gnt_list;

                ring->waiting_reqs = 0;
                smp_mb(); /* clear flag *before* checking for work */

                ret = do_block_io_op(ring);
                if (ret > 0)
                        ring->waiting_reqs = 1;
                if (ret == -EACCES)
                        wait_event_interruptible(ring->shutdown_wq,
                                                 kthread_should_stop());

purge_gnt_list:
                if (blkif->vbd.feature_gnt_persistent &&
                    time_after(jiffies, ring->next_lru)) {
                        purge_persistent_gnt(ring);
                        ring->next_lru = jiffies + msecs_to_jiffies(LRU_INTERVAL);
                }

                /* Shrink if we have more than xen_blkif_max_buffer_pages */
                shrink_free_pagepool(ring, xen_blkif_max_buffer_pages);

                if (log_stats && time_after(jiffies, ring->st_print))
                        print_stats(ring);
        }

        /* Drain pending purge work */
        flush_work(&ring->persistent_purge_work);

        if (log_stats)
                print_stats(ring);

        ring->xenblkd = NULL;
        xen_blkif_put(blkif);

        return 0;
}
/*
 * Remove persistent grants and empty the pool of free pages
 */
void xen_blkbk_free_caches(struct xen_blkif_ring *ring)
{
        /* Free all persistent grant pages */
        if (!RB_EMPTY_ROOT(&ring->persistent_gnts))
                free_persistent_gnts(ring, &ring->persistent_gnts,
                                     ring->persistent_gnt_c);

        BUG_ON(!RB_EMPTY_ROOT(&ring->persistent_gnts));
        ring->persistent_gnt_c = 0;

        /* Since we are shutting down remove all pages from the buffer */
        shrink_free_pagepool(ring, 0 /* All */);
}

static unsigned int xen_blkbk_unmap_prepare(
        struct xen_blkif_ring *ring,
        struct grant_page **pages,
        unsigned int num,
        struct gnttab_unmap_grant_ref *unmap_ops,
        struct page **unmap_pages)
{
        unsigned int i, invcount = 0;

        for (i = 0; i < num; i++) {
                if (pages[i]->persistent_gnt != NULL) {
                        put_persistent_gnt(ring, pages[i]->persistent_gnt);
                        continue;
                }
                if (pages[i]->handle == BLKBACK_INVALID_HANDLE)
                        continue;
                unmap_pages[invcount] = pages[i]->page;
                gnttab_set_unmap_op(&unmap_ops[invcount], vaddr(pages[i]->page),
                                    GNTMAP_host_map, pages[i]->handle);
                pages[i]->handle = BLKBACK_INVALID_HANDLE;
                invcount++;
        }

        return invcount;
}
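/*
 * Completion path for the asynchronous unmap started by
 * xen_blkbk_unmap_and_respond(): persistently mapped segments were already
 * released in xen_blkbk_unmap_prepare() (they stay mapped), so only the
 * non-persistent pages come back here to be returned to the free-page pool
 * before the response is pushed to the frontend.
 */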
static void xen_blkbk_unmap_and_respond_callback(int result, struct gntab_unmap_queue_data *data)
{
        struct pending_req *pending_req = (struct pending_req *)(data->data);
        struct xen_blkif_ring *ring = pending_req->ring;
        struct xen_blkif *blkif = ring->blkif;

        /* BUG_ON used to reproduce existing behaviour,
           but is this the best way to deal with this? */
        BUG_ON(result);

        put_free_pages(ring, data->pages, data->count);
        make_response(ring, pending_req->id,
                      pending_req->operation, pending_req->status);
        free_req(ring, pending_req);
        /*
         * Make sure the request is freed before releasing blkif,
         * or there could be a race between free_req and the
         * cleanup done in xen_blkif_free during shutdown.
         *
         * NB: The fact that we might try to wake up pending_free_wq
         * before drain_complete (in case there's a drain going on)
         * is not a problem with our current implementation
         * because we can be sure there's no thread waiting on
         * pending_free_wq if there's a drain going on, but it has
         * to be taken into account if the current model is changed.
         */
        if (atomic_dec_and_test(&ring->inflight) && atomic_read(&blkif->drain)) {
                complete(&blkif->drain_complete);
        }
        xen_blkif_put(blkif);
}
static void xen_blkbk_unmap_and_respond(struct pending_req *req)
{
        struct gntab_unmap_queue_data *work = &req->gnttab_unmap_data;
        struct xen_blkif_ring *ring = req->ring;
        struct grant_page **pages = req->segments;
        unsigned int invcount;

        invcount = xen_blkbk_unmap_prepare(ring, pages, req->nr_segs,
                                           req->unmap, req->unmap_pages);

        work->data = req;
        work->done = xen_blkbk_unmap_and_respond_callback;
        work->unmap_ops = req->unmap;
        work->kunmap_ops = NULL;
        work->pages = req->unmap_pages;
        work->count = invcount;

        gnttab_unmap_refs_async(&req->gnttab_unmap_data);
}

/*
 * Unmap the grant references.
 *
 * This could accumulate ops up to the batch size to reduce the number
 * of hypercalls, but since this is only used in error paths there's
 * no real need.
 */
static void xen_blkbk_unmap(struct xen_blkif_ring *ring,
                            struct grant_page *pages[],
                            int num)
{
        struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST];
        struct page *unmap_pages[BLKIF_MAX_SEGMENTS_PER_REQUEST];
        unsigned int invcount = 0;
        int ret;

        while (num) {
                unsigned int batch = min(num, BLKIF_MAX_SEGMENTS_PER_REQUEST);

                invcount = xen_blkbk_unmap_prepare(ring, pages, batch,
                                                   unmap, unmap_pages);
                if (invcount) {
                        ret = gnttab_unmap_refs(unmap, NULL, unmap_pages, invcount);
                        BUG_ON(ret);
                        put_free_pages(ring, unmap_pages, invcount);
                }
                pages += batch;
                num -= batch;
        }
}
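/*
 * Map the grant references of a request into the backend's address space.
 * Grants are processed in batches of at most BLKIF_MAX_SEGMENTS_PER_REQUEST
 * map operations per hypercall. When persistent grants are in use, a grant
 * that is already in the rb-tree is reused without a new mapping, and a
 * freshly mapped grant is added to the tree while there is room below
 * xen_blkif_max_pgrants; otherwise it is treated as an ordinary,
 * per-request mapping.
 */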
static int xen_blkbk_map(struct xen_blkif_ring *ring,
                         struct grant_page *pages[],
                         int num, bool ro)
{
        struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST];
        struct page *pages_to_gnt[BLKIF_MAX_SEGMENTS_PER_REQUEST];
        struct persistent_gnt *persistent_gnt = NULL;
        phys_addr_t addr = 0;
        int i, seg_idx, new_map_idx;
        int segs_to_map = 0;
        int ret = 0;
        int last_map = 0, map_until = 0;
        int use_persistent_gnts;
        struct xen_blkif *blkif = ring->blkif;

        use_persistent_gnts = (blkif->vbd.feature_gnt_persistent);

        /*
         * Fill out preq.nr_sects with the proper number of sectors, and set up
         * map[..] with the PFN of the page in our domain with the
         * corresponding grant reference for each page.
         */
again:
        for (i = map_until; i < num; i++) {
                uint32_t flags;

                if (use_persistent_gnts) {
                        persistent_gnt = get_persistent_gnt(
                                ring,
                                pages[i]->gref);
                }

                if (persistent_gnt) {
                        /*
                         * We are using persistent grants and
                         * the grant is already mapped
                         */
                        pages[i]->page = persistent_gnt->page;
                        pages[i]->persistent_gnt = persistent_gnt;
                } else {
                        if (get_free_page(ring, &pages[i]->page))
                                goto out_of_memory;
                        addr = vaddr(pages[i]->page);
                        pages_to_gnt[segs_to_map] = pages[i]->page;
                        pages[i]->persistent_gnt = NULL;
                        flags = GNTMAP_host_map;
                        if (!use_persistent_gnts && ro)
                                flags |= GNTMAP_readonly;
                        gnttab_set_map_op(&map[segs_to_map++], addr,
                                          flags, pages[i]->gref,
                                          blkif->domid);
                }
                map_until = i + 1;
                if (segs_to_map == BLKIF_MAX_SEGMENTS_PER_REQUEST)
                        break;
        }

        if (segs_to_map) {
                ret = gnttab_map_refs(map, NULL, pages_to_gnt, segs_to_map);
                BUG_ON(ret);
        }

        /*
         * Now swizzle the MFN in our domain with the MFN from the other domain
         * so that when we access vaddr(pending_req,i) it has the contents of
         * the page from the other domain.
         */
        for (seg_idx = last_map, new_map_idx = 0; seg_idx < map_until; seg_idx++) {
                if (!pages[seg_idx]->persistent_gnt) {
                        /* This is a newly mapped grant */
                        BUG_ON(new_map_idx >= segs_to_map);
                        if (unlikely(map[new_map_idx].status != 0)) {
                                pr_debug("invalid buffer -- could not remap it\n");
                                put_free_pages(ring, &pages[seg_idx]->page, 1);
                                pages[seg_idx]->handle = BLKBACK_INVALID_HANDLE;
                                ret |= 1;
                                goto next;
                        }
                        pages[seg_idx]->handle = map[new_map_idx].handle;
                } else {
                        continue;
                }
                if (use_persistent_gnts &&
                    ring->persistent_gnt_c < xen_blkif_max_pgrants) {
                        /*
                         * We are using persistent grants, the grant is
                         * not mapped but we might have room for it.
                         */
                        persistent_gnt = kmalloc(sizeof(struct persistent_gnt),
                                                 GFP_KERNEL);
                        if (!persistent_gnt) {
                                /*
                                 * If we don't have enough memory to
                                 * allocate the persistent_gnt struct
                                 * map this grant non-persistently
                                 */
                                goto next;
                        }
                        persistent_gnt->gnt = map[new_map_idx].ref;
                        persistent_gnt->handle = map[new_map_idx].handle;
                        persistent_gnt->page = pages[seg_idx]->page;
                        if (add_persistent_gnt(ring,
                                               persistent_gnt)) {
                                kfree(persistent_gnt);
                                persistent_gnt = NULL;
                                goto next;
                        }
                        pages[seg_idx]->persistent_gnt = persistent_gnt;
                        pr_debug("grant %u added to the tree of persistent grants, using %u/%u\n",
                                 persistent_gnt->gnt, ring->persistent_gnt_c,
                                 xen_blkif_max_pgrants);
                        goto next;
                }
                if (use_persistent_gnts && !blkif->vbd.overflow_max_grants) {
                        blkif->vbd.overflow_max_grants = 1;
                        pr_debug("domain %u, device %#x is using maximum number of persistent grants\n",
                                 blkif->domid, blkif->vbd.handle);
                }
                /*
                 * We could not map this grant persistently, so use it as
                 * a non-persistent grant.
                 */
next:
                new_map_idx++;
        }
        segs_to_map = 0;
        last_map = map_until;
        if (map_until != num)
                goto again;

        return ret;

out_of_memory:
        pr_alert("%s: out of memory\n", __func__);
        put_free_pages(ring, pages_to_gnt, segs_to_map);
        return -ENOMEM;
}
static int xen_blkbk_map_seg(struct pending_req *pending_req)
{
        int rc;

        rc = xen_blkbk_map(pending_req->ring, pending_req->segments,
                           pending_req->nr_segs,
                           (pending_req->operation != BLKIF_OP_READ));

        return rc;
}
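/*
 * Indirect requests carry their segment descriptors in extra granted pages
 * rather than in the request itself. Those indirect pages are mapped like
 * data segments, each blkif_request_segment is read under kmap_atomic(),
 * and first_sect/last_sect are sanity-checked (via READ_ONCE() so the
 * frontend cannot change them between check and use) before the sector
 * counts are accumulated into the phys_req.
 */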
static int xen_blkbk_parse_indirect(struct blkif_request *req,
                                    struct pending_req *pending_req,
                                    struct seg_buf seg[],
                                    struct phys_req *preq)
{
        struct grant_page **pages = pending_req->indirect_pages;
        struct xen_blkif_ring *ring = pending_req->ring;
        int indirect_grefs, rc, n, nseg, i;
        struct blkif_request_segment *segments = NULL;

        nseg = pending_req->nr_segs;
        indirect_grefs = INDIRECT_PAGES(nseg);
        BUG_ON(indirect_grefs > BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST);

        for (i = 0; i < indirect_grefs; i++)
                pages[i]->gref = req->u.indirect.indirect_grefs[i];

        rc = xen_blkbk_map(ring, pages, indirect_grefs, true);
        if (rc)
                goto unmap;

        for (n = 0, i = 0; n < nseg; n++) {
                uint8_t first_sect, last_sect;

                if ((n % SEGS_PER_INDIRECT_FRAME) == 0) {
                        /* Map indirect segments */
                        if (segments)
                                kunmap_atomic(segments);
                        segments = kmap_atomic(pages[n/SEGS_PER_INDIRECT_FRAME]->page);
                }
                i = n % SEGS_PER_INDIRECT_FRAME;

                pending_req->segments[n]->gref = segments[i].gref;

                first_sect = READ_ONCE(segments[i].first_sect);
                last_sect = READ_ONCE(segments[i].last_sect);
                if (last_sect >= (XEN_PAGE_SIZE >> 9) || last_sect < first_sect) {
                        rc = -EINVAL;
                        goto unmap;
                }

                seg[n].nsec = last_sect - first_sect + 1;
                seg[n].offset = first_sect << 9;
                preq->nr_sects += seg[n].nsec;
        }

unmap:
        if (segments)
                kunmap_atomic(segments);
        xen_blkbk_unmap(ring, pages, indirect_grefs);
        return rc;
}
static int dispatch_discard_io(struct xen_blkif_ring *ring,
                               struct blkif_request *req)
{
        int err = 0;
        int status = BLKIF_RSP_OKAY;
        struct xen_blkif *blkif = ring->blkif;
        struct block_device *bdev = blkif->vbd.bdev;
        unsigned long secure;
        struct phys_req preq;

        xen_blkif_get(blkif);

        preq.sector_number = req->u.discard.sector_number;
        preq.nr_sects      = req->u.discard.nr_sectors;

        err = xen_vbd_translate(&preq, blkif, REQ_OP_WRITE);
        if (err) {
                pr_warn("access denied: DISCARD [%llu->%llu] on dev=%04x\n",
                        preq.sector_number,
                        preq.sector_number + preq.nr_sects, blkif->vbd.pdevice);
                goto fail_response;
        }
        ring->st_ds_req++;

        secure = (blkif->vbd.discard_secure &&
                  (req->u.discard.flag & BLKIF_DISCARD_SECURE)) ?
                 BLKDEV_DISCARD_SECURE : 0;

        err = blkdev_issue_discard(bdev, req->u.discard.sector_number,
                                   req->u.discard.nr_sectors,
                                   GFP_KERNEL, secure);
fail_response:
        if (err == -EOPNOTSUPP) {
                pr_debug("discard op failed, not supported\n");
                status = BLKIF_RSP_EOPNOTSUPP;
        } else if (err)
                status = BLKIF_RSP_ERROR;

        make_response(ring, req->u.discard.id, req->operation, status);
        xen_blkif_put(blkif);
        return err;
}

static int dispatch_other_io(struct xen_blkif_ring *ring,
                             struct blkif_request *req,
                             struct pending_req *pending_req)
{
        free_req(ring, pending_req);
        make_response(ring, req->u.other.id, req->operation,
                      BLKIF_RSP_EOPNOTSUPP);
        return -EIO;
}

static void xen_blk_drain_io(struct xen_blkif_ring *ring)
{
        struct xen_blkif *blkif = ring->blkif;

        atomic_set(&blkif->drain, 1);
        do {
                if (atomic_read(&ring->inflight) == 0)
                        break;
                wait_for_completion_interruptible_timeout(
                                &blkif->drain_complete, HZ);

                if (!atomic_read(&blkif->drain))
                        break;
        } while (!kthread_should_stop());
        atomic_set(&blkif->drain, 0);
}
/*
 * Completion handling for the bios of a request; called from
 * end_block_io_op(), our bio->bi_end_io callback.
 */
static void __end_block_io_op(struct pending_req *pending_req, int error)
{
        /* An error fails the entire request. */
        if ((pending_req->operation == BLKIF_OP_FLUSH_DISKCACHE) &&
            (error == -EOPNOTSUPP)) {
                pr_debug("flush diskcache op failed, not supported\n");
                xen_blkbk_flush_diskcache(XBT_NIL, pending_req->ring->blkif->be, 0);
                pending_req->status = BLKIF_RSP_EOPNOTSUPP;
        } else if ((pending_req->operation == BLKIF_OP_WRITE_BARRIER) &&
                   (error == -EOPNOTSUPP)) {
                pr_debug("write barrier op failed, not supported\n");
                xen_blkbk_barrier(XBT_NIL, pending_req->ring->blkif->be, 0);
                pending_req->status = BLKIF_RSP_EOPNOTSUPP;
        } else if (error) {
                pr_debug("Buffer not up-to-date at end of operation,"
                         " error=%d\n", error);
                pending_req->status = BLKIF_RSP_ERROR;
        }

        /*
         * If all of the bios have completed it is time to unmap
         * the grant references associated with 'request' and provide
         * the proper response on the ring.
         */
        if (atomic_dec_and_test(&pending_req->pendcnt))
                xen_blkbk_unmap_and_respond(pending_req);
}

/*
 * bio callback.
 */
static void end_block_io_op(struct bio *bio)
{
        __end_block_io_op(bio->bi_private, bio->bi_error);
        bio_put(bio);
}
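/*
 * Ring-consumption notes for __do_block_io_op() below: the producer index
 * is read once under a read barrier, RING_REQUEST_PROD_OVERFLOW() rejects
 * a frontend that advertises more outstanding requests than the ring can
 * hold (ring processing is then halted with -EACCES), and each request is
 * copied into a private 'struct blkif_request' before it is validated, so
 * the frontend cannot change it under us after the checks.
 */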
/*
 * Function to copy from the ring buffer the 'struct blkif_request' (which
 * has the sectors we want, number of them, grant references, etc.) and
 * transmute it to the block API to hand it over to the proper block disk.
 */
static int
__do_block_io_op(struct xen_blkif_ring *ring)
{
        union blkif_back_rings *blk_rings = &ring->blk_rings;
        struct blkif_request req;
        struct pending_req *pending_req;
        RING_IDX rc, rp;
        int more_to_do = 0;

        rc = blk_rings->common.req_cons;
        rp = blk_rings->common.sring->req_prod;
        rmb(); /* Ensure we see queued requests up to 'rp'. */

        if (RING_REQUEST_PROD_OVERFLOW(&blk_rings->common, rp)) {
                rc = blk_rings->common.rsp_prod_pvt;
                pr_warn("Frontend provided bogus ring requests (%d - %d = %d). Halting ring processing on dev=%04x\n",
                        rp, rc, rp - rc, ring->blkif->vbd.pdevice);
                return -EACCES;
        }

        while (rc != rp) {

                if (RING_REQUEST_CONS_OVERFLOW(&blk_rings->common, rc))
                        break;

                if (kthread_should_stop()) {
                        more_to_do = 1;
                        break;
                }

                pending_req = alloc_req(ring);
                if (NULL == pending_req) {
                        ring->st_oo_req++;
                        more_to_do = 1;
                        break;
                }

                switch (ring->blkif->blk_protocol) {
                case BLKIF_PROTOCOL_NATIVE:
                        memcpy(&req, RING_GET_REQUEST(&blk_rings->native, rc), sizeof(req));
                        break;
                case BLKIF_PROTOCOL_X86_32:
                        blkif_get_x86_32_req(&req, RING_GET_REQUEST(&blk_rings->x86_32, rc));
                        break;
                case BLKIF_PROTOCOL_X86_64:
                        blkif_get_x86_64_req(&req, RING_GET_REQUEST(&blk_rings->x86_64, rc));
                        break;
                default:
                        BUG();
                }
                blk_rings->common.req_cons = ++rc; /* before make_response() */

                /* Apply all sanity checks to /private copy/ of request. */
                barrier();

                switch (req.operation) {
                case BLKIF_OP_READ:
                case BLKIF_OP_WRITE:
                case BLKIF_OP_WRITE_BARRIER:
                case BLKIF_OP_FLUSH_DISKCACHE:
                case BLKIF_OP_INDIRECT:
                        if (dispatch_rw_block_io(ring, &req, pending_req))
                                goto done;
                        break;
                case BLKIF_OP_DISCARD:
                        free_req(ring, pending_req);
                        if (dispatch_discard_io(ring, &req))
                                goto done;
                        break;
                default:
                        if (dispatch_other_io(ring, &req, pending_req))
                                goto done;
                        break;
                }

                /* Yield point for this unbounded loop. */
                cond_resched();
        }
done:
        return more_to_do;
}
static int
do_block_io_op(struct xen_blkif_ring *ring)
{
        union blkif_back_rings *blk_rings = &ring->blk_rings;
        int more_to_do;

        do {
                more_to_do = __do_block_io_op(ring);
                if (more_to_do)
                        break;

                RING_FINAL_CHECK_FOR_REQUESTS(&blk_rings->common, more_to_do);
        } while (more_to_do);

        return more_to_do;
}
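/*
 * dispatch_rw_block_io() below proceeds in stages: validate the operation
 * and segment count, translate the request against the vbd (bounds and
 * read-only checks), map the data grants, build as few bios as possible
 * with bio_add_page(), and finally submit them under a blk plug. The
 * pendcnt counter tracks the outstanding bios so the response is only sent
 * once all of them have completed in __end_block_io_op().
 */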
/*
 * Transmute the 'struct blkif_request' into a proper 'struct bio'
 * and call 'submit_bio' to pass it to the underlying storage.
 */
static int dispatch_rw_block_io(struct xen_blkif_ring *ring,
                                struct blkif_request *req,
                                struct pending_req *pending_req)
{
        struct phys_req preq;
        struct seg_buf *seg = pending_req->seg;
        unsigned int nseg;
        struct bio *bio = NULL;
        struct bio **biolist = pending_req->biolist;
        int i, nbio = 0;
        int operation;
        int operation_flags = 0;
        struct blk_plug plug;
        bool drain = false;
        struct grant_page **pages = pending_req->segments;
        unsigned short req_operation;

        req_operation = req->operation == BLKIF_OP_INDIRECT ?
                        req->u.indirect.indirect_op : req->operation;

        if ((req->operation == BLKIF_OP_INDIRECT) &&
            (req_operation != BLKIF_OP_READ) &&
            (req_operation != BLKIF_OP_WRITE)) {
                pr_debug("Invalid indirect operation (%u)\n", req_operation);
                goto fail_response;
        }

        switch (req_operation) {
        case BLKIF_OP_READ:
                ring->st_rd_req++;
                operation = REQ_OP_READ;
                break;
        case BLKIF_OP_WRITE:
                ring->st_wr_req++;
                operation = REQ_OP_WRITE;
                operation_flags = REQ_SYNC | REQ_IDLE;
                break;
        case BLKIF_OP_WRITE_BARRIER:
                drain = true;
                /* fall through */
        case BLKIF_OP_FLUSH_DISKCACHE:
                ring->st_f_req++;
                operation = REQ_OP_WRITE;
                operation_flags = REQ_PREFLUSH;
                break;
        default:
                operation = 0; /* make gcc happy */
                goto fail_response;
                break;
        }

        /* Check that the number of segments is sane. */
        nseg = req->operation == BLKIF_OP_INDIRECT ?
               req->u.indirect.nr_segments : req->u.rw.nr_segments;

        if (unlikely(nseg == 0 && operation_flags != REQ_PREFLUSH) ||
            unlikely((req->operation != BLKIF_OP_INDIRECT) &&
                     (nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST)) ||
            unlikely((req->operation == BLKIF_OP_INDIRECT) &&
                     (nseg > MAX_INDIRECT_SEGMENTS))) {
                pr_debug("Bad number of segments in request (%d)\n", nseg);
                /* Haven't submitted any bio's yet. */
                goto fail_response;
        }

        preq.nr_sects      = 0;

        pending_req->ring      = ring;
        pending_req->id        = req->u.rw.id;
        pending_req->operation = req_operation;
        pending_req->status    = BLKIF_RSP_OKAY;
        pending_req->nr_segs   = nseg;

        if (req->operation != BLKIF_OP_INDIRECT) {
                preq.dev               = req->u.rw.handle;
                preq.sector_number     = req->u.rw.sector_number;
                for (i = 0; i < nseg; i++) {
                        pages[i]->gref = req->u.rw.seg[i].gref;
                        seg[i].nsec = req->u.rw.seg[i].last_sect -
                                req->u.rw.seg[i].first_sect + 1;
                        seg[i].offset = (req->u.rw.seg[i].first_sect << 9);
                        if ((req->u.rw.seg[i].last_sect >= (XEN_PAGE_SIZE >> 9)) ||
                            (req->u.rw.seg[i].last_sect <
                             req->u.rw.seg[i].first_sect))
                                goto fail_response;
                        preq.nr_sects += seg[i].nsec;
                }
        } else {
                preq.dev               = req->u.indirect.handle;
                preq.sector_number     = req->u.indirect.sector_number;
                if (xen_blkbk_parse_indirect(req, pending_req, seg, &preq))
                        goto fail_response;
        }

        if (xen_vbd_translate(&preq, ring->blkif, operation) != 0) {
                pr_debug("access denied: %s of [%llu,%llu] on dev=%04x\n",
                         operation == REQ_OP_READ ? "read" : "write",
                         preq.sector_number,
                         preq.sector_number + preq.nr_sects,
                         ring->blkif->vbd.pdevice);
                goto fail_response;
        }

        /*
         * This check _MUST_ be done after xen_vbd_translate as the preq.bdev
         * is set there.
         */
        for (i = 0; i < nseg; i++) {
                if (((int)preq.sector_number|(int)seg[i].nsec) &
                    ((bdev_logical_block_size(preq.bdev) >> 9) - 1)) {
                        pr_debug("Misaligned I/O request from domain %d\n",
                                 ring->blkif->domid);
                        goto fail_response;
                }
        }

        /*
         * Wait on all outstanding I/O's and once that has been completed
         * issue the flush.
         */
        if (drain)
                xen_blk_drain_io(pending_req->ring);

        /*
         * If we have failed at this point, we need to undo the M2P override,
         * set gnttab_set_unmap_op on all of the grant references and perform
         * the hypercall to unmap the grants - that is all done in
         * xen_blkbk_unmap.
         */
        if (xen_blkbk_map_seg(pending_req))
                goto fail_flush;

        /*
         * This corresponding xen_blkif_put is done in __end_block_io_op, or
         * below (in "!bio") if we are handling a BLKIF_OP_DISCARD.
         */
        xen_blkif_get(ring->blkif);
        atomic_inc(&ring->inflight);

        for (i = 0; i < nseg; i++) {
                while ((bio == NULL) ||
                       (bio_add_page(bio,
                                     pages[i]->page,
                                     seg[i].nsec << 9,
                                     seg[i].offset) == 0)) {

                        int nr_iovecs = min_t(int, (nseg-i), BIO_MAX_PAGES);
                        bio = bio_alloc(GFP_KERNEL, nr_iovecs);
                        if (unlikely(bio == NULL))
                                goto fail_put_bio;

                        biolist[nbio++] = bio;
                        bio->bi_bdev    = preq.bdev;
                        bio->bi_private = pending_req;
                        bio->bi_end_io  = end_block_io_op;
                        bio->bi_iter.bi_sector = preq.sector_number;
                        bio_set_op_attrs(bio, operation, operation_flags);
                }

                preq.sector_number += seg[i].nsec;
        }

        /* This will be hit if the operation was a flush or discard. */
        if (!bio) {
                BUG_ON(operation_flags != REQ_PREFLUSH);

                bio = bio_alloc(GFP_KERNEL, 0);
                if (unlikely(bio == NULL))
                        goto fail_put_bio;

                biolist[nbio++] = bio;
                bio->bi_bdev    = preq.bdev;
                bio->bi_private = pending_req;
                bio->bi_end_io  = end_block_io_op;
                bio_set_op_attrs(bio, operation, operation_flags);
        }

        atomic_set(&pending_req->pendcnt, nbio);
        blk_start_plug(&plug);

        for (i = 0; i < nbio; i++)
                submit_bio(biolist[i]);

        /* Let the I/Os go.. */
        blk_finish_plug(&plug);

        if (operation == REQ_OP_READ)
                ring->st_rd_sect += preq.nr_sects;
        else if (operation == REQ_OP_WRITE)
                ring->st_wr_sect += preq.nr_sects;

        return 0;

fail_flush:
        xen_blkbk_unmap(ring, pending_req->segments,
                        pending_req->nr_segs);
fail_response:
        /* Haven't submitted any bio's yet. */
        make_response(ring, req->u.rw.id, req_operation, BLKIF_RSP_ERROR);
        free_req(ring, pending_req);
        msleep(1); /* back off a bit */
        return -EIO;

fail_put_bio:
        for (i = 0; i < nbio; i++)
                bio_put(biolist[i]);
        atomic_set(&pending_req->pendcnt, 1);
        __end_block_io_op(pending_req, -EINVAL);
        msleep(1); /* back off a bit */
        return -EIO;
}
/*
 * Put a response on the ring on how the operation fared.
 */
static void make_response(struct xen_blkif_ring *ring, u64 id,
                          unsigned short op, int st)
{
        struct blkif_response resp;
        unsigned long flags;
        union blkif_back_rings *blk_rings;
        int notify;

        resp.id        = id;
        resp.operation = op;
        resp.status    = st;

        spin_lock_irqsave(&ring->blk_ring_lock, flags);
        blk_rings = &ring->blk_rings;
        /* Place on the response ring for the relevant domain. */
        switch (ring->blkif->blk_protocol) {
        case BLKIF_PROTOCOL_NATIVE:
                memcpy(RING_GET_RESPONSE(&blk_rings->native, blk_rings->native.rsp_prod_pvt),
                       &resp, sizeof(resp));
                break;
        case BLKIF_PROTOCOL_X86_32:
                memcpy(RING_GET_RESPONSE(&blk_rings->x86_32, blk_rings->x86_32.rsp_prod_pvt),
                       &resp, sizeof(resp));
                break;
        case BLKIF_PROTOCOL_X86_64:
                memcpy(RING_GET_RESPONSE(&blk_rings->x86_64, blk_rings->x86_64.rsp_prod_pvt),
                       &resp, sizeof(resp));
                break;
        default:
                BUG();
        }
        blk_rings->common.rsp_prod_pvt++;
        RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&blk_rings->common, notify);
        spin_unlock_irqrestore(&ring->blk_ring_lock, flags);
        if (notify)
                notify_remote_via_irq(ring->irq);
}
static int __init xen_blkif_init(void)
{
        int rc = 0;

        if (!xen_domain())
                return -ENODEV;

        if (xen_blkif_max_ring_order > XENBUS_MAX_RING_GRANT_ORDER) {
                pr_info("Invalid max_ring_order (%d), will use default max: %d.\n",
                        xen_blkif_max_ring_order, XENBUS_MAX_RING_GRANT_ORDER);
                xen_blkif_max_ring_order = XENBUS_MAX_RING_GRANT_ORDER;
        }

        if (xenblk_max_queues == 0)
                xenblk_max_queues = num_online_cpus();

        rc = xen_blkif_interface_init();
        if (rc)
                goto failed_init;

        rc = xen_blkif_xenbus_init();
        if (rc)
                goto failed_init;

failed_init:
        return rc;
}

module_init(xen_blkif_init);

MODULE_LICENSE("Dual BSD/GPL");
MODULE_ALIAS("xen-backend:vbd");