/* xfs_aops.c */

/*
 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
 * All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 */
#include "xfs.h"
#include "xfs_bit.h"
#include "xfs_log.h"
#include "xfs_inum.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_trans.h"
#include "xfs_mount.h"
#include "xfs_bmap_btree.h"
#include "xfs_dinode.h"
#include "xfs_inode.h"
#include "xfs_inode_item.h"
#include "xfs_alloc.h"
#include "xfs_error.h"
#include "xfs_rw.h"
#include "xfs_iomap.h"
#include "xfs_vnodeops.h"
#include "xfs_trace.h"
#include "xfs_bmap.h"
#include <linux/gfp.h>
#include <linux/mpage.h>
#include <linux/pagevec.h>
#include <linux/writeback.h>

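/*
 * Report, via *delalloc and *unwritten, whether any buffer_heads on the
 * page are in delayed-allocation or unwritten state.
 */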
void
xfs_count_page_state(
	struct page	*page,
	int		*delalloc,
	int		*unwritten)
{
	struct buffer_head	*bh, *head;

	*delalloc = *unwritten = 0;

	bh = head = page_buffers(page);
	do {
		if (buffer_unwritten(bh))
			(*unwritten) = 1;
		else if (buffer_delay(bh))
			(*delalloc) = 1;
	} while ((bh = bh->b_this_page) != head);
}

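/*
 * Return the block device backing this inode's data: the realtime device
 * for realtime inodes, the main data device otherwise.
 */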
STATIC struct block_device *
xfs_find_bdev_for_inode(
	struct inode	*inode)
{
	struct xfs_inode	*ip = XFS_I(inode);
	struct xfs_mount	*mp = ip->i_mount;

	if (XFS_IS_REALTIME_INODE(ip))
		return mp->m_rtdev_targp->bt_bdev;
	else
		return mp->m_ddev_targp->bt_bdev;
}

/*
 * We're now finished for good with this ioend structure.
 * Update the page state via the associated buffer_heads,
 * release holds on the inode and bio, and finally free
 * up memory. Do not use the ioend after this.
 */
STATIC void
xfs_destroy_ioend(
	xfs_ioend_t	*ioend)
{
	struct buffer_head	*bh, *next;

	for (bh = ioend->io_buffer_head; bh; bh = next) {
		next = bh->b_private;
		bh->b_end_io(bh, !ioend->io_error);
	}

	if (ioend->io_iocb) {
		inode_dio_done(ioend->io_inode);
		if (ioend->io_isasync) {
			aio_complete(ioend->io_iocb, ioend->io_error ?
					ioend->io_error : ioend->io_result, 0);
		}
	}

	mempool_free(ioend, xfs_ioend_pool);
}

/*
 * Fast and loose check if this write could update the on-disk inode size.
 */
static inline bool xfs_ioend_is_append(struct xfs_ioend *ioend)
{
	return ioend->io_offset + ioend->io_size >
		XFS_I(ioend->io_inode)->i_d.di_size;
}

STATIC int
xfs_setfilesize_trans_alloc(
	struct xfs_ioend	*ioend)
{
	struct xfs_mount	*mp = XFS_I(ioend->io_inode)->i_mount;
	struct xfs_trans	*tp;
	int			error;

	tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);

	error = xfs_trans_reserve(tp, 0, XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0);
	if (error) {
		xfs_trans_cancel(tp, 0);
		return error;
	}

	ioend->io_append_trans = tp;

	/*
	 * We hand off the transaction to the completion thread now, so
	 * clear the flag here.
	 */
	current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
	return 0;
}

/*
 * Update on-disk file size now that data has been written to disk.
 */
STATIC int
xfs_setfilesize(
	struct xfs_ioend	*ioend)
{
	struct xfs_inode	*ip = XFS_I(ioend->io_inode);
	struct xfs_trans	*tp = ioend->io_append_trans;
	xfs_fsize_t		isize;

	/*
	 * The transaction was allocated in the I/O submission thread,
	 * thus we need to mark ourselves as being in a transaction
	 * manually.
	 */
	current_set_flags_nested(&tp->t_pflags, PF_FSTRANS);

	xfs_ilock(ip, XFS_ILOCK_EXCL);
	isize = xfs_new_eof(ip, ioend->io_offset + ioend->io_size);
	if (!isize) {
		xfs_iunlock(ip, XFS_ILOCK_EXCL);
		xfs_trans_cancel(tp, 0);
		return 0;
	}

	trace_xfs_setfilesize(ip, ioend->io_offset, ioend->io_size);

	ip->i_d.di_size = isize;
	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);

	return xfs_trans_commit(tp, 0);
}

/*
 * Schedule IO completion handling on the final put of an ioend.
 *
 * If there is no work to do we might as well call it a day and free the
 * ioend right now.
 */
STATIC void
xfs_finish_ioend(
	struct xfs_ioend	*ioend)
{
	if (atomic_dec_and_test(&ioend->io_remaining)) {
		struct xfs_mount	*mp = XFS_I(ioend->io_inode)->i_mount;

		if (ioend->io_type == IO_UNWRITTEN)
			queue_work(mp->m_unwritten_workqueue, &ioend->io_work);
		else if (ioend->io_append_trans)
			queue_work(mp->m_data_workqueue, &ioend->io_work);
		else
			xfs_destroy_ioend(ioend);
	}
}

/*
 * IO write completion.
 */
STATIC void
xfs_end_io(
	struct work_struct *work)
{
	xfs_ioend_t	*ioend = container_of(work, xfs_ioend_t, io_work);
	struct xfs_inode *ip = XFS_I(ioend->io_inode);
	int		error = 0;

	if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
		ioend->io_error = -EIO;
		goto done;
	}
	if (ioend->io_error)
		goto done;

	/*
	 * For unwritten extents we need to issue transactions to convert a
	 * range to normal written extents after the data I/O has finished.
	 */
	if (ioend->io_type == IO_UNWRITTEN) {
		/*
		 * For buffered I/O we never preallocate a transaction when
		 * doing the unwritten extent conversion, but for direct I/O
		 * we do not know if we are converting an unwritten extent
		 * or not at the point where we preallocate the transaction.
		 */
		if (ioend->io_append_trans) {
			ASSERT(ioend->io_isdirect);

			current_set_flags_nested(
				&ioend->io_append_trans->t_pflags, PF_FSTRANS);
			xfs_trans_cancel(ioend->io_append_trans, 0);
		}

		error = xfs_iomap_write_unwritten(ip, ioend->io_offset,
						  ioend->io_size);
		if (error) {
			ioend->io_error = -error;
			goto done;
		}
	} else if (ioend->io_append_trans) {
		error = xfs_setfilesize(ioend);
		if (error)
			ioend->io_error = -error;
	} else {
		ASSERT(!xfs_ioend_is_append(ioend));
	}

done:
	xfs_destroy_ioend(ioend);
}

/*
 * Call IO completion handling in caller context on the final put of an ioend.
 */
STATIC void
xfs_finish_ioend_sync(
	struct xfs_ioend	*ioend)
{
	if (atomic_dec_and_test(&ioend->io_remaining))
		xfs_end_io(&ioend->io_work);
}

/*
 * Allocate and initialise an IO completion structure.
 * We need to track unwritten extent write completion here initially.
 * We'll need to extend this for updating the ondisk inode size later
 * (vs. incore size).
 */
STATIC xfs_ioend_t *
xfs_alloc_ioend(
	struct inode	*inode,
	unsigned int	type)
{
	xfs_ioend_t	*ioend;

	ioend = mempool_alloc(xfs_ioend_pool, GFP_NOFS);

	/*
	 * Set the count to 1 initially, which will prevent an I/O completion
	 * callback that happens before we have started all the I/O from
	 * calling the completion routine too early.
	 */
	atomic_set(&ioend->io_remaining, 1);
	ioend->io_isasync = 0;
	ioend->io_isdirect = 0;
	ioend->io_error = 0;
	ioend->io_list = NULL;
	ioend->io_type = type;
	ioend->io_inode = inode;
	ioend->io_buffer_head = NULL;
	ioend->io_buffer_tail = NULL;
	ioend->io_offset = 0;
	ioend->io_size = 0;
	ioend->io_iocb = NULL;
	ioend->io_result = 0;
	ioend->io_append_trans = NULL;

	INIT_WORK(&ioend->io_work, xfs_end_io);
	return ioend;
}

STATIC int
xfs_map_blocks(
	struct inode		*inode,
	loff_t			offset,
	struct xfs_bmbt_irec	*imap,
	int			type,
	int			nonblocking)
{
	struct xfs_inode	*ip = XFS_I(inode);
	struct xfs_mount	*mp = ip->i_mount;
	ssize_t			count = 1 << inode->i_blkbits;
	xfs_fileoff_t		offset_fsb, end_fsb;
	int			error = 0;
	int			bmapi_flags = XFS_BMAPI_ENTIRE;
	int			nimaps = 1;

	if (XFS_FORCED_SHUTDOWN(mp))
		return -XFS_ERROR(EIO);

	if (type == IO_UNWRITTEN)
		bmapi_flags |= XFS_BMAPI_IGSTATE;

	if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
		if (nonblocking)
			return -XFS_ERROR(EAGAIN);
		xfs_ilock(ip, XFS_ILOCK_SHARED);
	}

	ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
	       (ip->i_df.if_flags & XFS_IFEXTENTS));
	ASSERT(offset <= mp->m_maxioffset);

	if (offset + count > mp->m_maxioffset)
		count = mp->m_maxioffset - offset;
	end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);
	offset_fsb = XFS_B_TO_FSBT(mp, offset);
	error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb,
			       imap, &nimaps, bmapi_flags);
	xfs_iunlock(ip, XFS_ILOCK_SHARED);

	if (error)
		return -XFS_ERROR(error);

	if (type == IO_DELALLOC &&
	    (!nimaps || isnullstartblock(imap->br_startblock))) {
		error = xfs_iomap_write_allocate(ip, offset, count, imap);
		if (!error)
			trace_xfs_map_blocks_alloc(ip, offset, count, type, imap);
		return -XFS_ERROR(error);
	}

#ifdef DEBUG
	if (type == IO_UNWRITTEN) {
		ASSERT(nimaps);
		ASSERT(imap->br_startblock != HOLESTARTBLOCK);
		ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
	}
#endif
	if (nimaps)
		trace_xfs_map_blocks_found(ip, offset, count, type, imap);
	return 0;
}

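/*
 * Check whether the given file offset (in bytes) lies within the cached
 * extent mapping; the comparison is done in units of filesystem blocks.
 */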
STATIC int
xfs_imap_valid(
	struct inode		*inode,
	struct xfs_bmbt_irec	*imap,
	xfs_off_t		offset)
{
	offset >>= inode->i_blkbits;

	return offset >= imap->br_startoff &&
		offset < imap->br_startoff + imap->br_blockcount;
}

/*
 * BIO completion handler for buffered IO.
 */
STATIC void
xfs_end_bio(
	struct bio	*bio,
	int		error)
{
	xfs_ioend_t	*ioend = bio->bi_private;

	ASSERT(atomic_read(&bio->bi_cnt) >= 1);
	ioend->io_error = test_bit(BIO_UPTODATE, &bio->bi_flags) ? 0 : error;

	/* Toss bio and pass work off to an xfsdatad thread */
	bio->bi_private = NULL;
	bio->bi_end_io = NULL;
	bio_put(bio);

	xfs_finish_ioend(ioend);
}

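/*
 * Attach a bio to the ioend, taking an extra reference on the ioend that
 * xfs_end_bio() drops via xfs_finish_ioend() once the bio completes, and
 * submit it (synchronously for WB_SYNC_ALL writeback).
 */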
STATIC void
xfs_submit_ioend_bio(
	struct writeback_control *wbc,
	xfs_ioend_t		*ioend,
	struct bio		*bio)
{
	atomic_inc(&ioend->io_remaining);
	bio->bi_private = ioend;
	bio->bi_end_io = xfs_end_bio;
	submit_bio(wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE, bio);
}

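/*
 * Allocate a bio sized for the underlying block device and point it at the
 * first buffer's starting sector.
 */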
STATIC struct bio *
xfs_alloc_ioend_bio(
	struct buffer_head	*bh)
{
	int			nvecs = bio_get_nr_vecs(bh->b_bdev);
	struct bio		*bio = bio_alloc(GFP_NOIO, nvecs);

	ASSERT(bio->bi_private == NULL);
	bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
	bio->bi_bdev = bh->b_bdev;
	return bio;
}

STATIC void
xfs_start_buffer_writeback(
	struct buffer_head	*bh)
{
	ASSERT(buffer_mapped(bh));
	ASSERT(buffer_locked(bh));
	ASSERT(!buffer_delay(bh));
	ASSERT(!buffer_unwritten(bh));

	mark_buffer_async_write(bh);
	set_buffer_uptodate(bh);
	clear_buffer_dirty(bh);
}

STATIC void
xfs_start_page_writeback(
	struct page	*page,
	int		clear_dirty,
	int		buffers)
{
	ASSERT(PageLocked(page));
	ASSERT(!PageWriteback(page));
	if (clear_dirty)
		clear_page_dirty_for_io(page);
	set_page_writeback(page);
	unlock_page(page);
	/* If no buffers on the page are to be written, finish it here */
	if (!buffers)
		end_page_writeback(page);
}

static inline int bio_add_buffer(struct bio *bio, struct buffer_head *bh)
{
	return bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh));
}

/*
 * Submit all of the bios for all of the ioends we have saved up, covering the
 * initial writepage page and also any probed pages.
 *
 * Because we may have multiple ioends spanning a page, we need to start
 * writeback on all the buffers before we submit them for I/O. If we mark the
 * buffers as we go, we can end up with a page that only has some buffers
 * marked async write, and I/O completion on those can occur before we mark
 * the remaining buffers async write.
 *
 * The end result of this is that we trip a bug in end_page_writeback() because
 * we call it twice for the one page as the code in end_buffer_async_write()
 * assumes that all buffers on the page are started at the same time.
 *
 * The fix is two passes across the ioend list - one to start writeback on the
 * buffer_heads, and then submit them for I/O on the second pass.
 */
STATIC void
xfs_submit_ioend(
	struct writeback_control *wbc,
	xfs_ioend_t		*ioend)
{
	xfs_ioend_t		*head = ioend;
	xfs_ioend_t		*next;
	struct buffer_head	*bh;
	struct bio		*bio;
	sector_t		lastblock = 0;

	/* Pass 1 - start writeback */
	do {
		next = ioend->io_list;
		for (bh = ioend->io_buffer_head; bh; bh = bh->b_private)
			xfs_start_buffer_writeback(bh);
	} while ((ioend = next) != NULL);

	/* Pass 2 - submit I/O */
	ioend = head;
	do {
		next = ioend->io_list;
		bio = NULL;

		for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) {

			if (!bio) {
 retry:
				bio = xfs_alloc_ioend_bio(bh);
			} else if (bh->b_blocknr != lastblock + 1) {
				xfs_submit_ioend_bio(wbc, ioend, bio);
				goto retry;
			}

			if (bio_add_buffer(bio, bh) != bh->b_size) {
				xfs_submit_ioend_bio(wbc, ioend, bio);
				goto retry;
			}

			lastblock = bh->b_blocknr;
		}
		if (bio)
			xfs_submit_ioend_bio(wbc, ioend, bio);
		xfs_finish_ioend(ioend);
	} while ((ioend = next) != NULL);
}

/*
 * Cancel submission of all buffer_heads so far in this endio.
 * Toss the endio too. Only ever called for the initial page
 * in a writepage request, so only ever one page.
 */
STATIC void
xfs_cancel_ioend(
	xfs_ioend_t	*ioend)
{
	xfs_ioend_t	*next;
	struct buffer_head	*bh, *next_bh;

	do {
		next = ioend->io_list;
		bh = ioend->io_buffer_head;
		do {
			next_bh = bh->b_private;
			clear_buffer_async_write(bh);
			unlock_buffer(bh);
		} while ((bh = next_bh) != NULL);

		mempool_free(ioend, xfs_ioend_pool);
	} while ((ioend = next) != NULL);
}

/*
 * Test to see if we've been building up a completion structure for
 * earlier buffers -- if so, we try to append to this ioend if we
 * can, otherwise we finish off any current ioend and start another.
 * The ioend we are currently building is returned via the result pointer.
 */
STATIC void
xfs_add_to_ioend(
	struct inode		*inode,
	struct buffer_head	*bh,
	xfs_off_t		offset,
	unsigned int		type,
	xfs_ioend_t		**result,
	int			need_ioend)
{
	xfs_ioend_t		*ioend = *result;

	if (!ioend || need_ioend || type != ioend->io_type) {
		xfs_ioend_t	*previous = *result;

		ioend = xfs_alloc_ioend(inode, type);
		ioend->io_offset = offset;
		ioend->io_buffer_head = bh;
		ioend->io_buffer_tail = bh;
		if (previous)
			previous->io_list = ioend;
		*result = ioend;
	} else {
		ioend->io_buffer_tail->b_private = bh;
		ioend->io_buffer_tail = bh;
	}

	bh->b_private = NULL;
	ioend->io_size += bh->b_size;
}

STATIC void
xfs_map_buffer(
	struct inode		*inode,
	struct buffer_head	*bh,
	struct xfs_bmbt_irec	*imap,
	xfs_off_t		offset)
{
	sector_t		bn;
	struct xfs_mount	*m = XFS_I(inode)->i_mount;
	xfs_off_t		iomap_offset = XFS_FSB_TO_B(m, imap->br_startoff);
	xfs_daddr_t		iomap_bn = xfs_fsb_to_db(XFS_I(inode), imap->br_startblock);

	ASSERT(imap->br_startblock != HOLESTARTBLOCK);
	ASSERT(imap->br_startblock != DELAYSTARTBLOCK);

	bn = (iomap_bn >> (inode->i_blkbits - BBSHIFT)) +
	      ((offset - iomap_offset) >> inode->i_blkbits);

	ASSERT(bn || XFS_IS_REALTIME_INODE(XFS_I(inode)));

	bh->b_blocknr = bn;
	set_buffer_mapped(bh);
}

STATIC void
xfs_map_at_offset(
	struct inode		*inode,
	struct buffer_head	*bh,
	struct xfs_bmbt_irec	*imap,
	xfs_off_t		offset)
{
	ASSERT(imap->br_startblock != HOLESTARTBLOCK);
	ASSERT(imap->br_startblock != DELAYSTARTBLOCK);

	xfs_map_buffer(inode, bh, imap, offset);
	set_buffer_mapped(bh);
	clear_buffer_delay(bh);
	clear_buffer_unwritten(bh);
}

/*
 * Test if a given page is suitable for writing as part of an unwritten
 * or delayed allocate extent.
 */
STATIC int
xfs_is_delayed_page(
	struct page		*page,
	unsigned int		type)
{
	if (PageWriteback(page))
		return 0;

	if (page->mapping && page_has_buffers(page)) {
		struct buffer_head	*bh, *head;
		int			acceptable = 0;

		bh = head = page_buffers(page);
		do {
			if (buffer_unwritten(bh))
				acceptable = (type == IO_UNWRITTEN);
			else if (buffer_delay(bh))
				acceptable = (type == IO_DELALLOC);
			else if (buffer_dirty(bh) && buffer_mapped(bh))
				acceptable = (type == IO_OVERWRITE);
			else
				break;
		} while ((bh = bh->b_this_page) != head);

		if (acceptable)
			return 1;
	}

	return 0;
}

/*
 * Allocate & map buffers for a page given the extent map and write it out.
 * Except for the original page of a writepage call, this is called on
 * delalloc/unwritten pages only; for the original page it is possible
 * that the page has no mapping at all.
 */
STATIC int
xfs_convert_page(
	struct inode		*inode,
	struct page		*page,
	loff_t			tindex,
	struct xfs_bmbt_irec	*imap,
	xfs_ioend_t		**ioendp,
	struct writeback_control *wbc)
{
	struct buffer_head	*bh, *head;
	xfs_off_t		end_offset;
	unsigned long		p_offset;
	unsigned int		type;
	int			len, page_dirty;
	int			count = 0, done = 0, uptodate = 1;
	xfs_off_t		offset = page_offset(page);

	if (page->index != tindex)
		goto fail;
	if (!trylock_page(page))
		goto fail;
	if (PageWriteback(page))
		goto fail_unlock_page;
	if (page->mapping != inode->i_mapping)
		goto fail_unlock_page;
	if (!xfs_is_delayed_page(page, (*ioendp)->io_type))
		goto fail_unlock_page;

	/*
	 * page_dirty is initially a count of buffers on the page before
	 * EOF and is decremented as we move each into a cleanable state.
	 *
	 * Derivation:
	 *
	 * End offset is the highest offset that this page should represent.
	 * If we are on the last page, (end_offset & (PAGE_CACHE_SIZE - 1))
	 * will evaluate non-zero and be less than PAGE_CACHE_SIZE and
	 * hence give us the correct page_dirty count. On any other page,
	 * it will be zero and in that case we need page_dirty to be the
	 * count of buffers on the page.
	 */
	end_offset = min_t(unsigned long long,
			(xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT,
			i_size_read(inode));

	len = 1 << inode->i_blkbits;
	p_offset = min_t(unsigned long, end_offset & (PAGE_CACHE_SIZE - 1),
					PAGE_CACHE_SIZE);
	p_offset = p_offset ? roundup(p_offset, len) : PAGE_CACHE_SIZE;
	page_dirty = p_offset / len;
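	/*
	 * Worked example (illustrative numbers): with 4k pages, 1k blocks
	 * and i_size = 5k, the second page covers offsets 4k-8k, so
	 * end_offset = 5k, p_offset = 1k rounded up to 1k, and page_dirty
	 * = 1 (only the first buffer is before EOF).  On any earlier page
	 * p_offset is forced to PAGE_CACHE_SIZE and page_dirty counts all
	 * four buffers.
	 */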

	bh = head = page_buffers(page);
	do {
		if (offset >= end_offset)
			break;
		if (!buffer_uptodate(bh))
			uptodate = 0;
		if (!(PageUptodate(page) || buffer_uptodate(bh))) {
			done = 1;
			continue;
		}

		if (buffer_unwritten(bh) || buffer_delay(bh) ||
		    buffer_mapped(bh)) {
			if (buffer_unwritten(bh))
				type = IO_UNWRITTEN;
			else if (buffer_delay(bh))
				type = IO_DELALLOC;
			else
				type = IO_OVERWRITE;

			if (!xfs_imap_valid(inode, imap, offset)) {
				done = 1;
				continue;
			}

			lock_buffer(bh);
			if (type != IO_OVERWRITE)
				xfs_map_at_offset(inode, bh, imap, offset);
			xfs_add_to_ioend(inode, bh, offset, type,
					 ioendp, done);

			page_dirty--;
			count++;
		} else {
			done = 1;
		}
	} while (offset += len, (bh = bh->b_this_page) != head);

	if (uptodate && bh == head)
		SetPageUptodate(page);

	if (count) {
		if (--wbc->nr_to_write <= 0 &&
		    wbc->sync_mode == WB_SYNC_NONE)
			done = 1;
	}
	xfs_start_page_writeback(page, !page_dirty, count);

	return done;
 fail_unlock_page:
	unlock_page(page);
 fail:
	return 1;
}

/*
 * Convert & write out a cluster of pages in the same extent as defined
 * by mp and following the start page.
 */
STATIC void
xfs_cluster_write(
	struct inode		*inode,
	pgoff_t			tindex,
	struct xfs_bmbt_irec	*imap,
	xfs_ioend_t		**ioendp,
	struct writeback_control *wbc,
	pgoff_t			tlast)
{
	struct pagevec		pvec;
	int			done = 0, i;

	pagevec_init(&pvec, 0);
	while (!done && tindex <= tlast) {
		unsigned len = min_t(pgoff_t, PAGEVEC_SIZE, tlast - tindex + 1);

		if (!pagevec_lookup(&pvec, inode->i_mapping, tindex, len))
			break;

		for (i = 0; i < pagevec_count(&pvec); i++) {
			done = xfs_convert_page(inode, pvec.pages[i], tindex++,
					imap, ioendp, wbc);
			if (done)
				break;
		}

		pagevec_release(&pvec);
		cond_resched();
	}
}

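/*
 * Invalidate part or all of a page: trace the call and defer to the
 * generic buffer_head-based implementation.
 */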
STATIC void
xfs_vm_invalidatepage(
	struct page		*page,
	unsigned long		offset)
{
	trace_xfs_invalidatepage(page->mapping->host, page, offset);
	block_invalidatepage(page, offset);
}

/*
 * If the page has delalloc buffers on it, we need to punch them out before we
 * invalidate the page. If we don't, we leave a stale delalloc mapping on the
 * inode that can trip a BUG() in xfs_get_blocks() later on if a direct IO read
 * is done on that same region - the delalloc extent is returned when none is
 * supposed to be there.
 *
 * We prevent this by truncating away the delalloc regions on the page before
 * invalidating it. Because they are delalloc, we can do this without needing a
 * transaction. Indeed - if we get ENOSPC errors, we have to be able to do this
 * truncation without a transaction as there is no space left for block
 * reservation (typically why we see an ENOSPC in writeback).
 *
 * This is not a performance critical path, so for now just do the punching a
 * buffer head at a time.
 */
STATIC void
xfs_aops_discard_page(
	struct page		*page)
{
	struct inode		*inode = page->mapping->host;
	struct xfs_inode	*ip = XFS_I(inode);
	struct buffer_head	*bh, *head;
	loff_t			offset = page_offset(page);

	if (!xfs_is_delayed_page(page, IO_DELALLOC))
		goto out_invalidate;

	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
		goto out_invalidate;

	xfs_alert(ip->i_mount,
		"page discard on page %p, inode 0x%llx, offset %llu.",
			page, ip->i_ino, offset);

	xfs_ilock(ip, XFS_ILOCK_EXCL);
	bh = head = page_buffers(page);
	do {
		int		error;
		xfs_fileoff_t	start_fsb;

		if (!buffer_delay(bh))
			goto next_buffer;

		start_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
		error = xfs_bmap_punch_delalloc_range(ip, start_fsb, 1);
		if (error) {
			/* something screwed, just bail */
			if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
				xfs_alert(ip->i_mount,
			"page discard unable to remove delalloc mapping.");
			}
			break;
		}
next_buffer:
		offset += 1 << inode->i_blkbits;

	} while ((bh = bh->b_this_page) != head);

	xfs_iunlock(ip, XFS_ILOCK_EXCL);
out_invalidate:
	xfs_vm_invalidatepage(page, 0);
	return;
}

/*
 * Write out a dirty page.
 *
 * For delalloc space on the page we need to allocate space and flush it.
 * For unwritten space on the page we need to start the conversion to
 * regular allocated space.
 * For any other dirty buffer heads on the page we should flush them.
 */
STATIC int
xfs_vm_writepage(
	struct page		*page,
	struct writeback_control *wbc)
{
	struct inode		*inode = page->mapping->host;
	struct buffer_head	*bh, *head;
	struct xfs_bmbt_irec	imap;
	xfs_ioend_t		*ioend = NULL, *iohead = NULL;
	loff_t			offset;
	unsigned int		type;
	__uint64_t		end_offset;
	pgoff_t			end_index, last_index;
	ssize_t			len;
	int			err, imap_valid = 0, uptodate = 1;
	int			count = 0;
	int			nonblocking = 0;

	trace_xfs_writepage(inode, page, 0);

	ASSERT(page_has_buffers(page));

	/*
	 * Refuse to write the page out if we are called from reclaim context.
	 *
	 * This avoids stack overflows when called from deeply used stacks in
	 * random callers for direct reclaim or memcg reclaim. We explicitly
	 * allow reclaim from kswapd as the stack usage there is relatively low.
	 *
	 * This should never happen except in the case of a VM regression so
	 * warn about it.
	 */
	if (WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) ==
			PF_MEMALLOC))
		goto redirty;

	/*
	 * Given that we do not allow direct reclaim to call us, we should
	 * never be called while in a filesystem transaction.
	 */
	if (WARN_ON(current->flags & PF_FSTRANS))
		goto redirty;

	/* Is this page beyond the end of the file? */
	offset = i_size_read(inode);
	end_index = offset >> PAGE_CACHE_SHIFT;
	last_index = (offset - 1) >> PAGE_CACHE_SHIFT;
	if (page->index >= end_index) {
		if ((page->index >= end_index + 1) ||
		    !(i_size_read(inode) & (PAGE_CACHE_SIZE - 1))) {
			unlock_page(page);
			return 0;
		}
	}

	end_offset = min_t(unsigned long long,
			(xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT,
			offset);
	len = 1 << inode->i_blkbits;

	bh = head = page_buffers(page);
	offset = page_offset(page);
	type = IO_OVERWRITE;

	if (wbc->sync_mode == WB_SYNC_NONE)
		nonblocking = 1;

	do {
		int new_ioend = 0;

		if (offset >= end_offset)
			break;
		if (!buffer_uptodate(bh))
			uptodate = 0;

		/*
		 * set_page_dirty dirties all buffers in a page, independent
		 * of their state. The dirty state however is entirely
		 * meaningless for holes (!mapped && uptodate), so skip
		 * buffers covering holes here.
		 */
		if (!buffer_mapped(bh) && buffer_uptodate(bh)) {
			imap_valid = 0;
			continue;
		}

		if (buffer_unwritten(bh)) {
			if (type != IO_UNWRITTEN) {
				type = IO_UNWRITTEN;
				imap_valid = 0;
			}
		} else if (buffer_delay(bh)) {
			if (type != IO_DELALLOC) {
				type = IO_DELALLOC;
				imap_valid = 0;
			}
		} else if (buffer_uptodate(bh)) {
			if (type != IO_OVERWRITE) {
				type = IO_OVERWRITE;
				imap_valid = 0;
			}
		} else {
			if (PageUptodate(page)) {
				ASSERT(buffer_mapped(bh));
				imap_valid = 0;
			}
			continue;
		}

		if (imap_valid)
			imap_valid = xfs_imap_valid(inode, &imap, offset);
		if (!imap_valid) {
			/*
			 * If we didn't have a valid mapping then we need to
			 * put the new mapping into a separate ioend structure.
			 * This ensures non-contiguous extents always have
			 * separate ioends, which is particularly important
			 * for unwritten extent conversion at I/O completion
			 * time.
			 */
			new_ioend = 1;
			err = xfs_map_blocks(inode, offset, &imap, type,
					     nonblocking);
			if (err)
				goto error;
			imap_valid = xfs_imap_valid(inode, &imap, offset);
		}
		if (imap_valid) {
			lock_buffer(bh);
			if (type != IO_OVERWRITE)
				xfs_map_at_offset(inode, bh, &imap, offset);
			xfs_add_to_ioend(inode, bh, offset, type, &ioend,
					 new_ioend);
			count++;
		}

		if (!iohead)
			iohead = ioend;

	} while (offset += len, ((bh = bh->b_this_page) != head));

	if (uptodate && bh == head)
		SetPageUptodate(page);

	xfs_start_page_writeback(page, 1, count);

	if (ioend && imap_valid) {
		xfs_off_t		end_index;

		end_index = imap.br_startoff + imap.br_blockcount;

		/* to bytes */
		end_index <<= inode->i_blkbits;

		/* to pages */
		end_index = (end_index - 1) >> PAGE_CACHE_SHIFT;

		/* check against file size */
		if (end_index > last_index)
			end_index = last_index;

		xfs_cluster_write(inode, page->index + 1, &imap, &ioend,
				  wbc, end_index);
	}

	if (iohead) {
		/*
		 * Reserve log space if we might write beyond the on-disk
		 * inode size.
		 */
		if (ioend->io_type != IO_UNWRITTEN &&
		    xfs_ioend_is_append(ioend)) {
			err = xfs_setfilesize_trans_alloc(ioend);
			if (err)
				goto error;
		}

		xfs_submit_ioend(wbc, iohead);
	}

	return 0;

error:
	if (iohead)
		xfs_cancel_ioend(iohead);

	if (err == -EAGAIN)
		goto redirty;

	xfs_aops_discard_page(page);
	ClearPageUptodate(page);
	unlock_page(page);
	return err;

redirty:
	redirty_page_for_writepage(wbc, page);
	unlock_page(page);
	return 0;
}

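/*
 * Clear the truncated flag and hand the dirty pages to the generic
 * writeback code, which calls back into ->writepage for each page.
 */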
STATIC int
xfs_vm_writepages(
	struct address_space	*mapping,
	struct writeback_control *wbc)
{
	xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
	return generic_writepages(mapping, wbc);
}

/*
 * Called to move a page into cleanable state - and from there
 * to be released. The page should already be clean. We always
 * have buffer heads in this call.
 *
 * Returns 1 if the page is ok to release, 0 otherwise.
 */
STATIC int
xfs_vm_releasepage(
	struct page		*page,
	gfp_t			gfp_mask)
{
	int			delalloc, unwritten;

	trace_xfs_releasepage(page->mapping->host, page, 0);

	xfs_count_page_state(page, &delalloc, &unwritten);

	if (WARN_ON(delalloc))
		return 0;
	if (WARN_ON(unwritten))
		return 0;

	return try_to_free_buffers(page);
}

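/*
 * Common get_blocks implementation for the buffered and direct I/O paths:
 * map the requested range, allocating blocks when "create" is set (delayed
 * allocation for buffered writes, real allocation for direct I/O), and fill
 * in the buffer_head accordingly.
 */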
STATIC int
__xfs_get_blocks(
	struct inode		*inode,
	sector_t		iblock,
	struct buffer_head	*bh_result,
	int			create,
	int			direct)
{
	struct xfs_inode	*ip = XFS_I(inode);
	struct xfs_mount	*mp = ip->i_mount;
	xfs_fileoff_t		offset_fsb, end_fsb;
	int			error = 0;
	int			lockmode = 0;
	struct xfs_bmbt_irec	imap;
	int			nimaps = 1;
	xfs_off_t		offset;
	ssize_t			size;
	int			new = 0;

	if (XFS_FORCED_SHUTDOWN(mp))
		return -XFS_ERROR(EIO);

	offset = (xfs_off_t)iblock << inode->i_blkbits;
	ASSERT(bh_result->b_size >= (1 << inode->i_blkbits));
	size = bh_result->b_size;

	if (!create && direct && offset >= i_size_read(inode))
		return 0;

	if (create) {
		lockmode = XFS_ILOCK_EXCL;
		xfs_ilock(ip, lockmode);
	} else {
		lockmode = xfs_ilock_map_shared(ip);
	}

	ASSERT(offset <= mp->m_maxioffset);
	if (offset + size > mp->m_maxioffset)
		size = mp->m_maxioffset - offset;
	end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + size);
	offset_fsb = XFS_B_TO_FSBT(mp, offset);

	error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb,
			       &imap, &nimaps, XFS_BMAPI_ENTIRE);
	if (error)
		goto out_unlock;

	if (create &&
	    (!nimaps ||
	     (imap.br_startblock == HOLESTARTBLOCK ||
	      imap.br_startblock == DELAYSTARTBLOCK))) {
		if (direct) {
			error = xfs_iomap_write_direct(ip, offset, size,
						       &imap, nimaps);
		} else {
			error = xfs_iomap_write_delay(ip, offset, size, &imap);
		}
		if (error)
			goto out_unlock;

		trace_xfs_get_blocks_alloc(ip, offset, size, 0, &imap);
	} else if (nimaps) {
		trace_xfs_get_blocks_found(ip, offset, size, 0, &imap);
	} else {
		trace_xfs_get_blocks_notfound(ip, offset, size);
		goto out_unlock;
	}
	xfs_iunlock(ip, lockmode);

	if (imap.br_startblock != HOLESTARTBLOCK &&
	    imap.br_startblock != DELAYSTARTBLOCK) {
		/*
		 * For unwritten extents do not report a disk address on
		 * the read case (treat as if we're reading into a hole).
		 */
		if (create || !ISUNWRITTEN(&imap))
			xfs_map_buffer(inode, bh_result, &imap, offset);
		if (create && ISUNWRITTEN(&imap)) {
			if (direct)
				bh_result->b_private = inode;
			set_buffer_unwritten(bh_result);
		}
	}

	/*
	 * If this is a realtime file, data may be on a different device
	 * to that pointed to from the buffer_head b_bdev currently.
	 */
	bh_result->b_bdev = xfs_find_bdev_for_inode(inode);

	/*
	 * If we previously allocated a block out beyond eof and we are now
	 * coming back to use it then we will need to flag it as new even if it
	 * has a disk address.
	 *
	 * With sub-block writes into unwritten extents we also need to mark
	 * the buffer as new so that the unwritten parts of the buffer get
	 * correctly zeroed.
	 */
	if (create &&
	    ((!buffer_mapped(bh_result) && !buffer_uptodate(bh_result)) ||
	     (offset >= i_size_read(inode)) ||
	     (new || ISUNWRITTEN(&imap))))
		set_buffer_new(bh_result);

	if (imap.br_startblock == DELAYSTARTBLOCK) {
		BUG_ON(direct);
		if (create) {
			set_buffer_uptodate(bh_result);
			set_buffer_mapped(bh_result);
			set_buffer_delay(bh_result);
		}
	}

	/*
	 * If this is O_DIRECT or the mpage code calling, tell them how large
	 * the mapping is so that we can avoid repeated get_blocks calls.
	 */
	if (direct || size > (1 << inode->i_blkbits)) {
		xfs_off_t		mapping_size;

		mapping_size = imap.br_startoff + imap.br_blockcount - iblock;
		mapping_size <<= inode->i_blkbits;

		ASSERT(mapping_size > 0);
		if (mapping_size > size)
			mapping_size = size;
		if (mapping_size > LONG_MAX)
			mapping_size = LONG_MAX;

		bh_result->b_size = mapping_size;
	}

	return 0;

out_unlock:
	xfs_iunlock(ip, lockmode);
	return -error;
}

int
xfs_get_blocks(
	struct inode		*inode,
	sector_t		iblock,
	struct buffer_head	*bh_result,
	int			create)
{
	return __xfs_get_blocks(inode, iblock, bh_result, create, 0);
}

STATIC int
xfs_get_blocks_direct(
	struct inode		*inode,
	sector_t		iblock,
	struct buffer_head	*bh_result,
	int			create)
{
	return __xfs_get_blocks(inode, iblock, bh_result, create, 1);
}

/*
 * Complete a direct I/O write request.
 *
 * If the private argument is non-NULL __xfs_get_blocks signals us that we
 * need to issue a transaction to convert the range from unwritten to written
 * extents. In case this is regular synchronous I/O we just call xfs_end_io
 * to do this and we are done. But in case this was a successful AIO
 * request this handler is called from interrupt context, from which we
 * can't start transactions. In that case offload the I/O completion to
 * the workqueues we also use for buffered I/O completion.
 */
STATIC void
xfs_end_io_direct_write(
	struct kiocb		*iocb,
	loff_t			offset,
	ssize_t			size,
	void			*private,
	int			ret,
	bool			is_async)
{
	struct xfs_ioend	*ioend = iocb->private;

	/*
	 * While the generic direct I/O code updates the inode size, it does
	 * so only after the end_io handler is called, which means our
	 * end_io handler thinks the on-disk size is outside the in-core
	 * size. To prevent this just update it a little bit earlier here.
	 */
	if (offset + size > i_size_read(ioend->io_inode))
		i_size_write(ioend->io_inode, offset + size);

	/*
	 * blockdev_direct_IO can return an error even after the I/O
	 * completion handler was called. Thus we need to protect
	 * against double-freeing.
	 */
	iocb->private = NULL;

	ioend->io_offset = offset;
	ioend->io_size = size;
	ioend->io_iocb = iocb;
	ioend->io_result = ret;
	if (private && size > 0)
		ioend->io_type = IO_UNWRITTEN;

	if (is_async) {
		ioend->io_isasync = 1;
		xfs_finish_ioend(ioend);
	} else {
		xfs_finish_ioend_sync(ioend);
	}
}

STATIC ssize_t
xfs_vm_direct_IO(
	int			rw,
	struct kiocb		*iocb,
	const struct iovec	*iov,
	loff_t			offset,
	unsigned long		nr_segs)
{
	struct inode		*inode = iocb->ki_filp->f_mapping->host;
	struct block_device	*bdev = xfs_find_bdev_for_inode(inode);
	struct xfs_ioend	*ioend = NULL;
	ssize_t			ret;

	if (rw & WRITE) {
		size_t size = iov_length(iov, nr_segs);

		/*
		 * We need to preallocate a transaction for a size update
		 * here. In the case that this write both updates the size
		 * and converts at least one unwritten extent we will cancel
		 * the still clean transaction after the I/O has finished.
		 */
		iocb->private = ioend = xfs_alloc_ioend(inode, IO_DIRECT);
		if (offset + size > XFS_I(inode)->i_d.di_size) {
			ret = xfs_setfilesize_trans_alloc(ioend);
			if (ret)
				goto out_destroy_ioend;
			ioend->io_isdirect = 1;
		}

		ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov,
					   offset, nr_segs,
					   xfs_get_blocks_direct,
					   xfs_end_io_direct_write, NULL, 0);
		if (ret != -EIOCBQUEUED && iocb->private)
			goto out_trans_cancel;
	} else {
		ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov,
					   offset, nr_segs,
					   xfs_get_blocks_direct,
					   NULL, NULL, 0);
	}

	return ret;

out_trans_cancel:
	if (ioend->io_append_trans) {
		current_set_flags_nested(&ioend->io_append_trans->t_pflags,
					 PF_FSTRANS);
		xfs_trans_cancel(ioend->io_append_trans, 0);
	}
out_destroy_ioend:
	xfs_destroy_ioend(ioend);
	return ret;
}

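/*
 * Clean up after a failed or short buffered write: truncate the page cache
 * back to the old EOF and punch out any delalloc blocks that were allocated
 * between the old EOF and the end of the failed write.
 */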
STATIC void
xfs_vm_write_failed(
	struct address_space	*mapping,
	loff_t			to)
{
	struct inode		*inode = mapping->host;

	if (to > inode->i_size) {
		/*
		 * Punch out the delalloc blocks we have already allocated.
		 *
		 * Don't bother with xfs_setattr given that nothing can have
		 * made it to disk yet as the page is still locked at this
		 * point.
		 */
		struct xfs_inode	*ip = XFS_I(inode);
		xfs_fileoff_t		start_fsb;
		xfs_fileoff_t		end_fsb;
		int			error;

		truncate_pagecache(inode, to, inode->i_size);

		/*
		 * Check if there are any blocks that are outside of i_size
		 * that need to be trimmed back.
		 */
		start_fsb = XFS_B_TO_FSB(ip->i_mount, inode->i_size) + 1;
		end_fsb = XFS_B_TO_FSB(ip->i_mount, to);
		if (end_fsb <= start_fsb)
			return;

		xfs_ilock(ip, XFS_ILOCK_EXCL);
		error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
							end_fsb - start_fsb);
		if (error) {
			/* something screwed, just bail */
			if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
				xfs_alert(ip->i_mount,
			"xfs_vm_write_failed: unable to clean up ino %lld",
						ip->i_ino);
			}
		}
		xfs_iunlock(ip, XFS_ILOCK_EXCL);
	}
}

STATIC int
xfs_vm_write_begin(
	struct file		*file,
	struct address_space	*mapping,
	loff_t			pos,
	unsigned		len,
	unsigned		flags,
	struct page		**pagep,
	void			**fsdata)
{
	int			ret;

	ret = block_write_begin(mapping, pos, len, flags | AOP_FLAG_NOFS,
				pagep, xfs_get_blocks);
	if (unlikely(ret))
		xfs_vm_write_failed(mapping, pos + len);
	return ret;
}

STATIC int
xfs_vm_write_end(
	struct file		*file,
	struct address_space	*mapping,
	loff_t			pos,
	unsigned		len,
	unsigned		copied,
	struct page		*page,
	void			*fsdata)
{
	int			ret;

	ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
	if (unlikely(ret < len))
		xfs_vm_write_failed(mapping, pos + len);
	return ret;
}

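/*
 * Flush any dirty and delalloc data before mapping the block, so that the
 * result returned by generic_block_bmap reflects real on-disk allocations.
 */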
STATIC sector_t
xfs_vm_bmap(
	struct address_space	*mapping,
	sector_t		block)
{
	struct inode		*inode = (struct inode *)mapping->host;
	struct xfs_inode	*ip = XFS_I(inode);

	trace_xfs_vm_bmap(XFS_I(inode));
	xfs_ilock(ip, XFS_IOLOCK_SHARED);
	xfs_flush_pages(ip, (xfs_off_t)0, -1, 0, FI_REMAPF);
	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
	return generic_block_bmap(mapping, block, xfs_get_blocks);
}

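/*
 * The read paths simply defer to the generic mpage helpers, using
 * xfs_get_blocks to map file offsets to disk blocks.
 */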
STATIC int
xfs_vm_readpage(
	struct file		*unused,
	struct page		*page)
{
	return mpage_readpage(page, xfs_get_blocks);
}

STATIC int
xfs_vm_readpages(
	struct file		*unused,
	struct address_space	*mapping,
	struct list_head	*pages,
	unsigned		nr_pages)
{
	return mpage_readpages(mapping, pages, nr_pages, xfs_get_blocks);
}

const struct address_space_operations xfs_address_space_operations = {
	.readpage		= xfs_vm_readpage,
	.readpages		= xfs_vm_readpages,
	.writepage		= xfs_vm_writepage,
	.writepages		= xfs_vm_writepages,
	.releasepage		= xfs_vm_releasepage,
	.invalidatepage		= xfs_vm_invalidatepage,
	.write_begin		= xfs_vm_write_begin,
	.write_end		= xfs_vm_write_end,
	.bmap			= xfs_vm_bmap,
	.direct_IO		= xfs_vm_direct_IO,
	.migratepage		= buffer_migrate_page,
	.is_partially_uptodate	= block_is_partially_uptodate,
	.error_remove_page	= generic_error_remove_page,
};