WikiImporter.php 32 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128
  1. <?php
  2. /**
  3. * MediaWiki page data importer.
  4. *
  5. * Copyright © 2003,2005 Brion Vibber <brion@pobox.com>
  6. * https://www.mediawiki.org/
  7. *
  8. * This program is free software; you can redistribute it and/or modify
  9. * it under the terms of the GNU General Public License as published by
  10. * the Free Software Foundation; either version 2 of the License, or
  11. * (at your option) any later version.
  12. *
  13. * This program is distributed in the hope that it will be useful,
  14. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  16. * GNU General Public License for more details.
  17. *
  18. * You should have received a copy of the GNU General Public License along
  19. * with this program; if not, write to the Free Software Foundation, Inc.,
  20. * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  21. * http://www.gnu.org/copyleft/gpl.html
  22. *
  23. * @file
  24. * @ingroup SpecialPage
  25. */
  26. use MediaWiki\MediaWikiServices;
  27. /**
  28. * XML file reader for the page data importer.
  29. *
  30. * implements Special:Import
  31. * @ingroup SpecialPage
  32. */
  33. class WikiImporter {
  34. /** @var XMLReader */
  35. private $reader;
  36. private $foreignNamespaces = null;
  37. private $mLogItemCallback, $mUploadCallback, $mRevisionCallback, $mPageCallback;
  38. private $mSiteInfoCallback, $mPageOutCallback;
  39. private $mNoticeCallback, $mDebug;
  40. private $mImportUploads, $mImageBasePath;
  41. private $mNoUpdates = false;
  42. private $pageOffset = 0;
  43. /** @var Config */
  44. private $config;
  45. /** @var ImportTitleFactory */
  46. private $importTitleFactory;
  47. /** @var array */
  48. private $countableCache = [];
  49. /** @var bool */
  50. private $disableStatisticsUpdate = false;
  51. /** @var ExternalUserNames */
  52. private $externalUserNames;
  /**
   * Creates an ImportXMLReader drawing from the source provided.
   *
   * Registers the 'uploadsource' stream wrapper if it is not registered yet,
   * opens the XMLReader on the registered source and installs the default
   * page / revision / upload / log item / page-out callbacks.
   *
   * @param ImportSource $source
   * @param Config $config
   * @throws Exception If the XMLReader class is not available
   * @throws MWException If the XMLReader could not be opened on the source
   */
  public function __construct( ImportSource $source, Config $config ) {
      if ( !class_exists( 'XMLReader' ) ) {
          throw new Exception( 'Import requires PHP to have been compiled with libxml support' );
      }

      $this->reader = new XMLReader();
      $this->config = $config;

      if ( !in_array( 'uploadsource', stream_get_wrappers() ) ) {
          stream_wrapper_register( 'uploadsource', UploadSourceAdapter::class );
      }
      $id = UploadSourceAdapter::registerSource( $source );

      // Enable the entity loader, as it is needed for loading external URLs via
      // XMLReader::open (T86036)
      $oldDisable = libxml_disable_entity_loader( false );
      $error = false;
      try {
          if ( defined( 'LIBXML_PARSEHUGE' ) ) {
              $status = $this->reader->open( "uploadsource://$id", null, LIBXML_PARSEHUGE );
          } else {
              $status = $this->reader->open( "uploadsource://$id" );
          }
          if ( !$status ) {
              // Grab the error before touching any other libxml state.
              $error = libxml_get_last_error();
          }
      } finally {
          // Always put the entity loader back the way we found it, even if open() throws.
          libxml_disable_entity_loader( $oldDisable );
      }

      if ( !$status ) {
          // libxml_get_last_error() yields false when no error was recorded, so do
          // not blindly dereference it.
          $detail = $error ? $error->message : 'no libxml error details available';
          throw new MWException( 'Encountered an internal error while initializing WikiImporter object: ' .
              $detail );
      }

      // Default callbacks
      $this->setPageCallback( [ $this, 'beforeImportPage' ] );
      $this->setRevisionCallback( [ $this, "importRevision" ] );
      $this->setUploadCallback( [ $this, 'importUpload' ] );
      $this->setLogItemCallback( [ $this, 'importLogItem' ] );
      $this->setPageOutCallback( [ $this, 'finishImportPage' ] );

      $this->importTitleFactory = new NaiveImportTitleFactory();
      $this->externalUserNames = new ExternalUserNames( 'imported', false );
  }
  /**
   * Accessor for the XMLReader this importer reads its input from.
   *
   * @return null|XMLReader
   */
  public function getReader() {
      $xmlReader = $this->reader;
      return $xmlReader;
  }
  /**
   * Report an XML error. Despite the name nothing is thrown: the error is
   * passed to debug() and unconditionally written through wfDebug().
   *
   * @param string $err
   */
  public function throwXmlError( $err ) {
      $failureLine = "FAILURE: $err";
      $this->debug( $failureLine );

      $logLine = "WikiImporter XML error: $err\n";
      wfDebug( $logLine );
  }
  /**
   * Write a line to the debug log, but only while debug mode (see setDebug())
   * is switched on.
   *
   * @param string $data
   */
  public function debug( $data ) {
      if ( !$this->mDebug ) {
          return;
      }
      wfDebug( "IMPORT: $data\n" );
  }
  /**
   * Write a warning line to the debug log, regardless of the debug mode flag.
   *
   * @param string $data
   */
  public function warn( $data ) {
      $line = "IMPORT: $data\n";
      wfDebug( $line );
  }
  /**
   * Emit a notice. When a callable notice callback has been set it receives
   * the message and the parameter array; otherwise the rendered message text
   * goes to wfDebug().
   *
   * @param string $msg Passed to wfMessage() in the fallback path
   * @param mixed ...$params Message parameters
   */
  public function notice( $msg, ...$params ) {
      if ( !is_callable( $this->mNoticeCallback ) ) {
          // No ImportReporter -> CLI
          // T177997: the command line importers should call setNoticeCallback()
          // for their own custom callback to echo the notice
          wfDebug( wfMessage( $msg, $params )->text() . "\n" );
          return;
      }

      call_user_func( $this->mNoticeCallback, $msg, $params );
  }
  /**
   * Set debug mode. While enabled, debug() forwards its messages to wfDebug().
   *
   * @param bool $debug
   */
  public function setDebug( $debug ) {
      $this->mDebug = $debug;
  }
  /**
   * Set 'no updates' mode. In this mode, the link tables will not be updated by the importer
   *
   * @param bool $noupdates
   */
  public function setNoUpdates( $noupdates ) {
      $this->mNoUpdates = $noupdates;
  }
  /**
   * Sets 'pageOffset' value. So it will skip the first n-1 pages
   * and start from the nth page. It's 1-based indexing.
   *
   * @param int $nthPage
   * @since 1.29
   */
  public function setPageOffset( $nthPage ) {
      $this->pageOffset = $nthPage;
  }
  /**
   * Set a callback that displays notice messages.
   *
   * The assignment is delegated to wfSetVar(), whose result is returned.
   *
   * @param callable $callback
   * @return callable
   */
  public function setNoticeCallback( $callback ) {
      $previous = wfSetVar( $this->mNoticeCallback, $callback );
      return $previous;
  }
  /**
   * Sets the action to perform as each new page in the stream is reached.
   *
   * @param callable $callback
   * @return callable The callback that was installed before this call
   */
  public function setPageCallback( $callback ) {
      $replaced = $this->mPageCallback;
      $this->mPageCallback = $callback;

      return $replaced;
  }
  /**
   * Sets the action to perform as each page in the stream is completed.
   * Callback accepts the page title (as a Title object), a second object
   * with the original title form (in case it's been overridden into a
   * local namespace), and a count of revisions.
   *
   * @param callable $callback
   * @return callable The callback that was installed before this call
   */
  public function setPageOutCallback( $callback ) {
      $replaced = $this->mPageOutCallback;
      $this->mPageOutCallback = $callback;

      return $replaced;
  }
  /**
   * Sets the action to perform as each page revision is reached.
   *
   * @param callable $callback
   * @return callable The callback that was installed before this call
   */
  public function setRevisionCallback( $callback ) {
      $replaced = $this->mRevisionCallback;
      $this->mRevisionCallback = $callback;

      return $replaced;
  }
  /**
   * Sets the action to perform as each file upload version is reached.
   *
   * @param callable $callback
   * @return callable The callback that was installed before this call
   */
  public function setUploadCallback( $callback ) {
      $replaced = $this->mUploadCallback;
      $this->mUploadCallback = $callback;

      return $replaced;
  }
  /**
   * Sets the action to perform as each log item is reached.
   *
   * @param callable $callback
   * @return callable The callback that was installed before this call
   */
  public function setLogItemCallback( $callback ) {
      $replaced = $this->mLogItemCallback;
      $this->mLogItemCallback = $callback;

      return $replaced;
  }
  /**
   * Sets the action to perform when site info is encountered.
   *
   * @param callable $callback
   * @return callable The callback that was installed before this call
   */
  public function setSiteInfoCallback( $callback ) {
      $replaced = $this->mSiteInfoCallback;
      $this->mSiteInfoCallback = $callback;

      return $replaced;
  }
  /**
   * Replace the factory object used to convert ForeignTitle objects into
   * local Title objects.
   *
   * @param ImportTitleFactory $factory
   */
  public function setImportTitleFactory( $factory ) {
      $this->importTitleFactory = $factory;
  }
  /**
   * Set a target namespace to override the defaults.
   *
   * Null restores the naive title factory (no namespace override). Any other
   * value must be non-negative and name an existing namespace.
   *
   * @param null|int $namespace
   * @return bool False when the namespace was rejected, true otherwise
   */
  public function setTargetNamespace( $namespace ) {
      if ( $namespace === null ) {
          // Don't override namespaces
          $this->setImportTitleFactory( new NaiveImportTitleFactory() );
          return true;
      }

      if ( !( $namespace >= 0 ) ) {
          return false;
      }

      $nsInfo = MediaWikiServices::getInstance()->getNamespaceInfo();
      if ( !$nsInfo->exists( intval( $namespace ) ) ) {
          return false;
      }

      $nsId = intval( $namespace );
      $this->setImportTitleFactory( new NamespaceImportTitleFactory( $nsId ) );
      return true;
  }
  /**
   * Set a target root page under which all pages are imported.
   *
   * Null resets to the naive title factory; an empty string changes nothing.
   * Otherwise the root page must parse to a non-external title in a
   * namespace that has subpages, or the returned Status is made fatal.
   *
   * @param null|string $rootpage
   * @return Status
   */
  public function setTargetRootPage( $rootpage ) {
      $status = Status::newGood();

      if ( $rootpage === null ) {
          // No rootpage
          $this->setImportTitleFactory( new NaiveImportTitleFactory() );
          return $status;
      }
      if ( $rootpage === '' ) {
          return $status;
      }

      // avoid double slashes
      $rootpage = rtrim( $rootpage, '/' );
      $title = Title::newFromText( $rootpage );

      if ( !$title || $title->isExternal() ) {
          $status->fatal( 'import-rootpage-invalid' );
          return $status;
      }

      $subpagesAllowed = MediaWikiServices::getInstance()->getNamespaceInfo()->
          hasSubpages( $title->getNamespace() );
      if ( !$subpagesAllowed ) {
          if ( $title->getNamespace() == NS_MAIN ) {
              $displayNSText = wfMessage( 'blanknamespace' )->text();
          } else {
              $displayNSText = MediaWikiServices::getInstance()->getContentLanguage()->
                  getNsText( $title->getNamespace() );
          }
          $status->fatal( 'import-rootpage-nosubpage', $displayNSText );
          return $status;
      }

      // set namespace to 'all', so the namespace check in processTitle() can pass
      $this->setTargetNamespace( null );
      $this->setImportTitleFactory( new SubpageImportTitleFactory( $title ) );

      return $status;
  }
  /**
   * Store the image base path in $mImageBasePath.
   *
   * @param string $dir
   */
  public function setImageBasePath( $dir ) {
      $this->mImageBasePath = $dir;
  }
  /**
   * Store the "import uploads" flag in $mImportUploads.
   *
   * @param bool $import
   */
  public function setImportUploads( $import ) {
      $this->mImportUploads = $import;
  }
  /**
   * Replace the ExternalUserNames helper with one built from the given settings.
   *
   * @since 1.31
   * @param string $usernamePrefix Prefix to apply to unknown (and possibly also known) usernames
   * @param bool $assignKnownUsers Whether to apply the prefix to usernames that exist locally
   */
  public function setUsernamePrefix( $usernamePrefix, $assignKnownUsers ) {
      $userNames = new ExternalUserNames( $usernamePrefix, $assignKnownUsers );
      $this->externalUserNames = $userNames;
  }
  /**
   * Turn off the article count adjustment done in finishImportPage().
   * Statistics update can take a lot of time.
   *
   * @since 1.29
   */
  public function disableStatisticsUpdate() {
      $this->disableStatisticsUpdate = true;
  }
  /**
   * Default per-page callback. Remembers whether the page was countable
   * before the import, for the statistics adjustment in finishImportPage().
   *
   * @param array $titleAndForeignTitle Two-element array, with Title object at
   *   index 0 and ForeignTitle object at index 1
   * @return bool Always true
   */
  public function beforeImportPage( $titleAndForeignTitle ) {
      $localTitle = $titleAndForeignTitle[0];
      $wikiPage = WikiPage::factory( $localTitle );

      $cacheKey = 'title_' . $localTitle->getPrefixedText();
      $this->countableCache[$cacheKey] = $wikiPage->isCountable();

      return true;
  }
  /**
   * Default per-revision callback, performs the import.
   *
   * A notice is emitted and false returned when the revision's content
   * handler cannot be used on the target title, or when importing raises an
   * MWContentSerializationException.
   *
   * @param WikiRevision $revision
   * @return bool
   */
  public function importRevision( $revision ) {
      $handler = $revision->getContentHandler();
      if ( !$handler->canBeUsedOn( $revision->getTitle() ) ) {
          $this->notice( 'import-error-bad-location',
              $revision->getTitle()->getPrefixedText(),
              $revision->getID(),
              $revision->getModel(),
              $revision->getFormat() );

          return false;
      }

      try {
          $imported = $revision->importOldRevision();
      } catch ( MWContentSerializationException $ex ) {
          $this->notice( 'import-error-unserialize',
              $revision->getTitle()->getPrefixedText(),
              $revision->getID(),
              $revision->getModel(),
              $revision->getFormat() );
          $imported = false;
      }

      return $imported;
  }
  /**
   * Default per-log-item callback: delegates to the object's own
   * importLogItem() and hands back its result.
   *
   * @param WikiRevision $revision
   * @return bool
   */
  public function importLogItem( $revision ) {
      $result = $revision->importLogItem();
      return $result;
  }
  /**
   * Default per-upload callback: delegates to the object's own
   * importUpload() and hands back its result.
   *
   * @param WikiRevision $revision
   * @return bool
   */
  public function importUpload( $revision ) {
      $result = $revision->importUpload();
      return $result;
  }
  /**
   * Default page-out callback. Adjusts the article count statistic (unless
   * that has been disabled) and then runs the 'AfterImportPage' hook with
   * the arguments this method received. Mostly for hook use.
   *
   * @param Title $title
   * @param ForeignTitle $foreignTitle
   * @param int $revCount
   * @param int $sRevCount
   * @param array $pageInfo
   * @return bool Result of the 'AfterImportPage' hook
   */
  public function finishImportPage( $title, $foreignTitle, $revCount,
      $sRevCount, $pageInfo
  ) {
      // Update article count statistics (T42009)
      // The normal counting logic in WikiPage->doEditUpdates() is designed for
      // one-revision-at-a-time editing, not bulk imports. In this situation it
      // suffers from issues of replica DB lag. We let WikiPage handle the total page
      // and revision count, and we implement our own custom logic for the
      // article (content page) count.
      if ( !$this->disableStatisticsUpdate ) {
          $page = WikiPage::factory( $title );
          $page->loadPageData( 'fromdbmaster' );
          $content = $page->getContent();

          if ( $content !== null ) {
              $editInfo = $page->prepareContentForEdit( $content );
              $countKey = 'title_' . $title->getPrefixedText();
              $countable = $page->isCountable( $editInfo );

              // Only adjust when beforeImportPage() recorded a prior state and it changed.
              $hasPriorState = array_key_exists( $countKey, $this->countableCache );
              if ( $hasPriorState && $countable != $this->countableCache[$countKey] ) {
                  $delta = (int)$countable - (int)$this->countableCache[$countKey];
                  DeferredUpdates::addUpdate( SiteStatsUpdate::factory( [
                      'articles' => $delta
                  ] ) );
              }
          } else {
              wfDebug( __METHOD__ . ': Skipping article count adjustment for ' . $title .
                  ' because WikiPage::getContent() returned null' );
          }
      }

      return Hooks::run( 'AfterImportPage', func_get_args() );
  }
  /**
   * Alternate per-revision callback, for debugging. Dumps the revision's
   * title, user, timestamp, comment and text through debug().
   *
   * @param WikiRevision &$revision
   */
  public function debugRevisionHandler( &$revision ) {
      $this->debug( "Got revision:" );

      $titleText = is_object( $revision->title )
          ? $revision->title->getPrefixedText()
          : "<invalid>";
      $this->debug( "-- Title: " . $titleText );

      // Label => property read off the revision object, in output order.
      $fields = [
          'User' => 'user_text',
          'Timestamp' => 'timestamp',
          'Comment' => 'comment',
          'Text' => 'text',
      ];
      foreach ( $fields as $label => $property ) {
          $this->debug( "-- $label: " . $revision->$property );
      }
  }
  /**
   * Notify the callback function of site info.
   *
   * @param array $siteInfo
   * @return bool|mixed The callback's result, or false when no callback is set
   */
  private function siteInfoCallback( $siteInfo ) {
      if ( !isset( $this->mSiteInfoCallback ) ) {
          return false;
      }

      $arguments = [ $siteInfo, $this ];
      return call_user_func_array( $this->mSiteInfoCallback, $arguments );
  }
  /**
   * Notify the callback function when a new "<page>" is reached.
   * Does nothing when no page callback is set.
   *
   * @param array $title
   */
  public function pageCallback( $title ) {
      if ( isset( $this->mPageCallback ) ) {
          call_user_func( $this->mPageCallback, $title );
      }
  }
  /**
   * Notify the callback function when a "</page>" is closed.
   * Does nothing when no page-out callback is set.
   *
   * @param Title $title
   * @param ForeignTitle $foreignTitle
   * @param int $revCount
   * @param int $sucCount Number of revisions for which callback returned true
   * @param array $pageInfo Associative array of page information
   */
  private function pageOutCallback( $title, $foreignTitle, $revCount,
      $sucCount, $pageInfo ) {
      if ( !isset( $this->mPageOutCallback ) ) {
          return;
      }

      // Forward exactly what we were called with.
      call_user_func_array( $this->mPageOutCallback, func_get_args() );
  }
  /**
   * Notify the callback function of a revision.
   *
   * @param WikiRevision $revision
   * @return bool|mixed The callback's result, or false when no callback is set
   */
  private function revisionCallback( $revision ) {
      if ( !isset( $this->mRevisionCallback ) ) {
          return false;
      }

      $arguments = [ $revision, $this ];
      return call_user_func_array( $this->mRevisionCallback, $arguments );
  }
  /**
   * Notify the callback function of a new log item.
   *
   * @param WikiRevision $revision
   * @return bool|mixed The callback's result, or false when no callback is set
   */
  private function logItemCallback( $revision ) {
      if ( !isset( $this->mLogItemCallback ) ) {
          return false;
      }

      $arguments = [ $revision, $this ];
      return call_user_func_array( $this->mLogItemCallback, $arguments );
  }
  /**
   * Retrieves the contents of the named attribute of the current element.
   *
   * The result of XMLReader::getAttribute() is returned unchanged.
   * NOTE(review): earlier documentation promised an empty string for a
   * missing attribute, but nothing here converts the result - confirm
   * what callers actually receive in that case.
   *
   * @param string $attr The name of the attribute
   * @return string|null The value of the attribute
   */
  public function nodeAttribute( $attr ) {
      $value = $this->reader->getAttribute( $attr );
      return $value;
  }
  /**
   * Fetches text contents of the current element, assuming
   * no sub-elements or such scary things.
   *
   * Text, CDATA and significant-whitespace nodes are concatenated until the
   * first end-element node is read. If the input runs out before that, the
   * reader is closed and an empty string is returned.
   *
   * @return string
   * @private
   */
  public function nodeContents() {
      if ( $this->reader->isEmptyElement ) {
          return "";
      }

      $textNodeTypes = [
          XMLReader::TEXT,
          XMLReader::CDATA,
          XMLReader::SIGNIFICANT_WHITESPACE,
      ];

      $collected = "";
      while ( $this->reader->read() ) {
          $nodeType = $this->reader->nodeType;

          if ( $nodeType == XMLReader::END_ELEMENT ) {
              return $collected;
          }
          if ( in_array( $nodeType, $textNodeTypes ) ) {
              $collected .= $this->reader->value;
          }
      }

      // Ran out of input without seeing an end element.
      $this->reader->close();
      return '';
  }
  /**
   * Primary entry point.
   *
   * Expects the first node to be <mediawiki>, then walks the top-level nodes
   * and dispatches <siteinfo>, <page> and <logitem> to their handlers. When a
   * page offset is set, nodes are skipped until that many <page> tags have
   * been seen. The libxml entity loader state is restored and the reader is
   * closed when the loop ends, whether normally or by any throwable.
   *
   * @throws Exception
   * @throws MWException If the document does not start with <mediawiki>
   * @return bool Always true when no exception is thrown
   */
  public function doImport() {
      // Calls to reader->read need to be wrapped in calls to
      // libxml_disable_entity_loader() to avoid local file
      // inclusion attacks (T48932).
      $oldDisable = libxml_disable_entity_loader( true );
      $this->reader->read();

      if ( $this->reader->localName != 'mediawiki' ) {
          libxml_disable_entity_loader( $oldDisable );
          throw new MWException( "Expected <mediawiki> tag, got " .
              $this->reader->localName );
      }

      try {
          $this->debug( "<mediawiki> tag is correct." );
          $this->debug( "Starting primary dump processing loop." );

          $keepReading = $this->reader->read();
          $skip = false;
          $pageCount = 0;

          while ( $keepReading ) {
              $tag = $this->reader->localName;

              if ( $this->pageOffset ) {
                  if ( $tag === 'page' ) {
                      $pageCount++;
                  }
                  if ( $pageCount < $this->pageOffset ) {
                      // Still before the requested page: jump over this node's subtree.
                      $keepReading = $this->reader->next();
                      continue;
                  }
              }

              $type = $this->reader->nodeType;

              if ( !Hooks::run( 'ImportHandleToplevelXMLTag', [ $this ] ) ) {
                  // Do nothing
              } elseif ( $tag == 'mediawiki' && $type == XMLReader::END_ELEMENT ) {
                  break;
              } elseif ( $tag == 'siteinfo' ) {
                  $this->handleSiteInfo();
              } elseif ( $tag == 'page' ) {
                  $this->handlePage();
              } elseif ( $tag == 'logitem' ) {
                  $this->handleLogItem();
              } elseif ( $tag != '#text' ) {
                  $this->warn( "Unhandled top-level XML tag $tag" );
                  $skip = true;
              }

              if ( $skip ) {
                  $keepReading = $this->reader->next();
                  $skip = false;
                  $this->debug( "Skip" );
              } else {
                  $keepReading = $this->reader->read();
              }
          }
      } finally {
          // Runs for every exit path, including Error throwables that a
          // catch ( Exception ) would miss, so the entity loader setting is
          // never left flipped.
          libxml_disable_entity_loader( $oldDisable );
          $this->reader->close();
      }

      return true;
  }
  /**
   * Read the <siteinfo> element up to its end tag, collect the simple fields
   * and the foreign namespace names, and pass the result to the site info
   * callback.
   */
  private function handleSiteInfo() {
      $this->debug( "Enter site info handler." );
      $siteInfo = [];

      // Fields that can just be stuffed in the siteInfo object
      $normalFields = [ 'sitename', 'base', 'generator', 'case' ];

      while ( $this->reader->read() ) {
          $atEndOfSiteInfo = $this->reader->nodeType == XMLReader::END_ELEMENT &&
              $this->reader->localName == 'siteinfo';
          if ( $atEndOfSiteInfo ) {
              break;
          }

          $tag = $this->reader->localName;

          if ( $tag == 'namespace' ) {
              // Read the attribute first: nodeContents() moves the reader forward.
              $nsKey = $this->nodeAttribute( 'key' );
              $this->foreignNamespaces[$nsKey] = $this->nodeContents();
              continue;
          }
          if ( in_array( $tag, $normalFields ) ) {
              $siteInfo[$tag] = $this->nodeContents();
          }
      }

      $siteInfo['_namespaces'] = $this->foreignNamespaces;
      $this->siteInfoCallback( $siteInfo );
  }
  /**
   * Read a <logitem> element up to its end tag, collecting its fields, and
   * pass them on to processLogItem(). The 'ImportHandleLogItemXMLTag' hook
   * is run for every node first; when it returns false the node is left alone.
   */
  private function handleLogItem() {
      $this->debug( "Enter log item handler." );
      $logInfo = [];

      // Fields that can just be stuffed in the pageInfo object
      $normalFields = [ 'id', 'comment', 'type', 'action', 'timestamp',
          'logtitle', 'params' ];

      while ( $this->reader->read() ) {
          $atEndOfLogItem = $this->reader->nodeType == XMLReader::END_ELEMENT &&
              $this->reader->localName == 'logitem';
          if ( $atEndOfLogItem ) {
              break;
          }

          $tag = $this->reader->localName;

          if ( !Hooks::run( 'ImportHandleLogItemXMLTag', [
              $this, $logInfo
          ] ) ) {
              // Do nothing
              continue;
          }

          if ( in_array( $tag, $normalFields ) ) {
              $logInfo[$tag] = $this->nodeContents();
              continue;
          }
          if ( $tag == 'contributor' ) {
              $logInfo['contributor'] = $this->handleContributor();
              continue;
          }
          if ( $tag != '#text' ) {
              $this->warn( "Unhandled log-item XML tag $tag" );
          }
      }

      $this->processLogItem( $logInfo );
  }
  /**
   * Build a WikiRevision from the collected log item fields and hand it to
   * the log item callback.
   *
   * @param array $logInfo Fields gathered by handleLogItem()
   * @return bool|mixed Result of the log item callback
   */
  private function processLogItem( $logInfo ) {
      $logEntry = new WikiRevision( $this->config );

      if ( isset( $logInfo['id'] ) ) {
          $logEntry->setID( $logInfo['id'] );
      }

      // 'type' and 'action' are read without an isset() guard.
      $logEntry->setType( $logInfo['type'] );
      $logEntry->setAction( $logInfo['action'] );

      if ( isset( $logInfo['timestamp'] ) ) {
          $logEntry->setTimestamp( $logInfo['timestamp'] );
      }
      if ( isset( $logInfo['params'] ) ) {
          $logEntry->setParams( $logInfo['params'] );
      }
      if ( isset( $logInfo['logtitle'] ) ) {
          // @todo Using Title for non-local titles is a recipe for disaster.
          // We should use ForeignTitle here instead.
          $logEntry->setTitle( Title::newFromText( $logInfo['logtitle'] ) );
      }

      $logEntry->setNoUpdates( $this->mNoUpdates );

      if ( isset( $logInfo['comment'] ) ) {
          $logEntry->setComment( $logInfo['comment'] );
      }
      if ( isset( $logInfo['contributor']['ip'] ) ) {
          $logEntry->setUserIP( $logInfo['contributor']['ip'] );
      }

      if ( isset( $logInfo['contributor']['username'] ) ) {
          $logEntry->setUsername(
              $this->externalUserNames->applyPrefix( $logInfo['contributor']['username'] )
          );
      } else {
          $logEntry->setUsername( $this->externalUserNames->addPrefix( 'Unknown user' ) );
      }

      return $this->logItemCallback( $logEntry );
  }
  /**
   * Read a <page> element up to its end tag. Simple fields are collected
   * into the page info array; the first <revision> or <upload> resolves the
   * title and fires the page callback, and every <revision> / <upload> is
   * passed to its handler. The page-out callback runs at the end if a title
   * was resolved.
   *
   * @suppress PhanTypeInvalidDimOffset Phan not reading the reference inside the hook
   */
  private function handlePage() {
      // Handle page data.
      $this->debug( "Enter page handler." );
      $pageInfo = [ 'revisionCount' => 0, 'successfulRevisionCount' => 0 ];

      // Fields that can just be stuffed in the pageInfo object
      $normalFields = [ 'title', 'ns', 'id', 'redirect', 'restrictions' ];

      $skip = false;
      $badTitle = false;

      while ( $skip ? $this->reader->next() : $this->reader->read() ) {
          if ( $this->reader->nodeType == XMLReader::END_ELEMENT &&
              $this->reader->localName == 'page' ) {
              break;
          }

          $skip = false;
          $tag = $this->reader->localName;

          if ( $badTitle ) {
              // The title is invalid, bail out of this page
              $skip = true;
              continue;
          }

          if ( !Hooks::run( 'ImportHandlePageXMLTag', [ $this,
              &$pageInfo ] ) ) {
              // Do nothing
              continue;
          }

          if ( in_array( $tag, $normalFields ) ) {
              // An XML snippet:
              // <page>
              // <id>123</id>
              // <title>Page</title>
              // <redirect title="NewTitle"/>
              // ...
              // Because the redirect tag is built differently, we need special handling for that case.
              $pageInfo[$tag] = $tag == 'redirect'
                  ? $this->nodeAttribute( 'title' )
                  : $this->nodeContents();
              continue;
          }

          if ( $tag != 'revision' && $tag != 'upload' ) {
              if ( $tag != '#text' ) {
                  $this->warn( "Unhandled page XML tag $tag" );
                  $skip = true;
              }
              continue;
          }

          // From here on $tag is 'revision' or 'upload'.
          if ( !isset( $title ) ) {
              $title = $this->processTitle( $pageInfo['title'],
                  $pageInfo['ns'] ?? null );

              // $title is either an array of two titles or false.
              if ( is_array( $title ) ) {
                  $this->pageCallback( $title );
                  list( $pageInfo['_title'], $foreignTitle ) = $title;
              } else {
                  $badTitle = true;
                  $skip = true;
              }
          }

          if ( $title ) {
              if ( $tag == 'revision' ) {
                  $this->handleRevision( $pageInfo );
              } else {
                  $this->handleUpload( $pageInfo );
              }
          }
      }

      // @note $pageInfo['_title'] is only set if a valid $title was processed
      // above with no error; in that case pageCallback was called above and
      // $foreignTitle was assigned from the same list() as $pageInfo['_title'].
      // If $pageInfo['_title'] is not set, then $foreignTitle is not set either.
      if ( array_key_exists( '_title', $pageInfo ) ) {
          $this->pageOutCallback( $pageInfo['_title'], $foreignTitle,
              $pageInfo['revisionCount'],
              $pageInfo['successfulRevisionCount'],
              $pageInfo );
      }
  }
  /**
   * Read a <revision> element up to its end tag, collect its fields and pass
   * them to processRevision(). Increments the page's revision count, and its
   * successful revision count when processRevision() returns a truthy value.
   *
   * @param array &$pageInfo Page information; the two counters are updated in place
   */
  private function handleRevision( &$pageInfo ) {
      $this->debug( "Enter revision handler" );
      $revisionInfo = [];

      $normalFields = [ 'id', 'timestamp', 'comment', 'minor', 'model', 'format', 'text', 'sha1' ];

      $skip = false;

      while ( $skip ? $this->reader->next() : $this->reader->read() ) {
          if ( $this->reader->nodeType == XMLReader::END_ELEMENT &&
              $this->reader->localName == 'revision' ) {
              break;
          }

          // The skip request only applies to the node that raised it. Without
          // this reset a single unknown tag would switch every later iteration
          // to next(); handlePage() resets its flag the same way.
          $skip = false;
          $tag = $this->reader->localName;

          if ( !Hooks::run( 'ImportHandleRevisionXMLTag', [
              $this, $pageInfo, $revisionInfo
          ] ) ) {
              // Do nothing
          } elseif ( in_array( $tag, $normalFields ) ) {
              $revisionInfo[$tag] = $this->nodeContents();
          } elseif ( $tag == 'contributor' ) {
              $revisionInfo['contributor'] = $this->handleContributor();
          } elseif ( $tag != '#text' ) {
              $this->warn( "Unhandled revision XML tag $tag" );
              // Jump over the unknown element's subtree on the next advance.
              $skip = true;
          }
      }

      $pageInfo['revisionCount']++;
      if ( $this->processRevision( $pageInfo, $revisionInfo ) ) {
          $pageInfo['successfulRevisionCount']++;
      }
  }
  /**
   * Build a WikiRevision from the parsed fields of a <revision> element and
   * pass it to the revision callback.
   *
   * Only the fields present in $revisionInfo are applied. The timestamp falls
   * back to the current time, and a revision without an IP or username is
   * attributed to a prefixed 'Unknown user'.
   *
   * @param array $pageInfo Page state; '_title' is used as the revision's title
   * @param array $revisionInfo Parsed revision fields keyed by XML tag name
   * @throws MWException If the revision text is larger than $wgMaxArticleSize
   * @return bool|mixed Whatever revisionCallback() returns
   */
  private function processRevision( $pageInfo, $revisionInfo ) {
      global $wgMaxArticleSize;

      // Make sure revisions won't violate $wgMaxArticleSize, which could lead to
      // database errors and instability. Testing for revisions with only listed
      // content models, as other content models might use serialization formats
      // which aren't checked against $wgMaxArticleSize.
      $sizeCheckedModels = [
          'wikitext',
          'css',
          'json',
          'javascript',
          'text',
          ''
      ];
      // The text element is optional (see the isset() below), so only measure
      // it when it exists instead of reading an undefined key.
      if ( isset( $revisionInfo['text'] ) &&
          ( !isset( $revisionInfo['model'] ) ||
              in_array( $revisionInfo['model'], $sizeCheckedModels ) ) &&
          strlen( $revisionInfo['text'] ) > $wgMaxArticleSize * 1024
      ) {
          throw new MWException( 'The text of ' .
              ( isset( $revisionInfo['id'] ) ?
                  "the revision with ID $revisionInfo[id]" :
                  'a revision'
              ) . " exceeds the maximum allowable size ($wgMaxArticleSize KB)" );
      }

      // FIXME: process schema version 11!

      $revision = new WikiRevision( $this->config );

      if ( isset( $revisionInfo['id'] ) ) {
          $revision->setID( $revisionInfo['id'] );
      }
      if ( isset( $revisionInfo['model'] ) ) {
          $revision->setModel( $revisionInfo['model'] );
      }
      if ( isset( $revisionInfo['format'] ) ) {
          $revision->setFormat( $revisionInfo['format'] );
      }
      $revision->setTitle( $pageInfo['_title'] );

      if ( isset( $revisionInfo['text'] ) ) {
          // Model, format and title are set above, before the content handler is requested
          $handler = $revision->getContentHandler();
          $text = $handler->importTransform(
              $revisionInfo['text'],
              $revision->getFormat() );

          $revision->setText( $text );
      }
      $revision->setTimestamp( $revisionInfo['timestamp'] ?? wfTimestampNow() );

      if ( isset( $revisionInfo['comment'] ) ) {
          $revision->setComment( $revisionInfo['comment'] );
      }

      // Presence of the <minor> tag is what marks the edit as minor
      if ( isset( $revisionInfo['minor'] ) ) {
          $revision->setMinor( true );
      }
      if ( isset( $revisionInfo['contributor']['ip'] ) ) {
          $revision->setUserIP( $revisionInfo['contributor']['ip'] );
      } elseif ( isset( $revisionInfo['contributor']['username'] ) ) {
          $revision->setUsername(
              $this->externalUserNames->applyPrefix( $revisionInfo['contributor']['username'] )
          );
      } else {
          $revision->setUsername( $this->externalUserNames->addPrefix( 'Unknown user' ) );
      }
      if ( isset( $revisionInfo['sha1'] ) ) {
          $revision->setSha1Base36( $revisionInfo['sha1'] );
      }
      $revision->setNoUpdates( $this->mNoUpdates );

      return $this->revisionCallback( $revision );
  }
  /**
   * Read one <upload> element from the XML stream and, when upload importing
   * is enabled, hand it to processUpload().
   *
   * Each node is first offered to the ImportHandleUploadXMLTag hook; if the
   * hook call returns a falsy value no built-in handling is applied. Plain
   * fields are stored under their tag name, <contributor> is parsed by
   * handleContributor(), and base64-encoded <contents> is written to a
   * temporary file. If an image base path is configured and the file named
   * by <rel> exists below it, that file is used as the source instead.
   *
   * @param array &$pageInfo Page state, passed on to the hook and processUpload()
   * @return mixed Result of processUpload(), or null when uploads are not imported
   */
  private function handleUpload( &$pageInfo ) {
      $this->debug( "Enter upload handler" );
      $uploadInfo = [];

      $normalFields = [ 'timestamp', 'comment', 'filename', 'text',
          'src', 'size', 'sha1base36', 'archivename', 'rel' ];

      // Deletes a temporary source file that is about to be replaced, so it
      // does not stay behind in the temp directory.
      $discardTempSrc = static function ( array $info ) {
          if ( !empty( $info['isTempSrc'] ) &&
              isset( $info['fileSrc'] ) &&
              is_string( $info['fileSrc'] ) &&
              is_file( $info['fileSrc'] )
          ) {
              unlink( $info['fileSrc'] );
          }
      };

      $skip = false;

      while ( $skip ? $this->reader->next() : $this->reader->read() ) {
          if ( $this->reader->nodeType == XMLReader::END_ELEMENT &&
              $this->reader->localName == 'upload' ) {
              break;
          }

          $tag = $this->reader->localName;

          if ( !Hooks::run( 'ImportHandleUploadXMLTag', [
              $this, $pageInfo
          ] ) ) {
              // Do nothing
          } elseif ( in_array( $tag, $normalFields ) ) {
              $uploadInfo[$tag] = $this->nodeContents();
          } elseif ( $tag == 'contributor' ) {
              $uploadInfo['contributor'] = $this->handleContributor();
          } elseif ( $tag == 'contents' ) {
              // Read the attribute while the cursor is still on the opening
              // tag, so it does not depend on where nodeContents() leaves it.
              $encoding = $this->reader->getAttribute( 'encoding' );
              // Always consume the node, even if its data ends up unused
              $contents = $this->nodeContents();
              // Only materialise the file when it can actually be imported;
              // otherwise the temp file would never be used or removed.
              if ( $encoding === 'base64' && $this->mImportUploads ) {
                  $discardTempSrc( $uploadInfo );
                  $uploadInfo['fileSrc'] = $this->dumpTemp( base64_decode( $contents ) );
                  $uploadInfo['isTempSrc'] = true;
              }
          } elseif ( $tag != '#text' ) {
              $this->warn( "Unhandled upload XML tag $tag" );
              $skip = true;
          }
      }

      if ( $this->mImageBasePath && isset( $uploadInfo['rel'] ) ) {
          // NOTE(review): 'rel' comes straight from the import XML and is not
          // checked for '..' segments before being appended - confirm that the
          // dump is trusted or add validation.
          $path = "{$this->mImageBasePath}/{$uploadInfo['rel']}";
          if ( file_exists( $path ) ) {
              // The on-disk file wins over embedded contents; drop the temp copy
              $discardTempSrc( $uploadInfo );
              $uploadInfo['fileSrc'] = $path;
              $uploadInfo['isTempSrc'] = false;
          }
      }

      if ( $this->mImportUploads ) {
          return $this->processUpload( $pageInfo, $uploadInfo );
      }

      return null;
  }
  /**
   * Write data to a newly created temporary file and return the file's path.
   *
   * The file is created in wfTempDir() with the prefix 'importupload'.
   *
   * @param string $contents Data to store in the file
   * @throws MWException If the temporary file cannot be created or written
   * @return string Path of the temporary file
   */
  private function dumpTemp( $contents ) {
      $filename = tempnam( wfTempDir(), 'importupload' );
      if ( $filename === false ) {
          throw new MWException( 'Could not create a temporary file for an imported upload' );
      }

      if ( file_put_contents( $filename, $contents ) === false ) {
          // Do not leave a truncated file behind for a later import step to pick up
          if ( is_file( $filename ) ) {
              unlink( $filename );
          }
          throw new MWException( "Could not write imported upload data to $filename" );
      }

      return $filename;
  }
  /**
   * Build a WikiRevision describing an uploaded file from the parsed fields of
   * an <upload> element and pass it to the upload callback.
   *
   * Elements that were absent from the XML are read with a fallback rather
   * than through an undefined array key. The timestamp falls back to the
   * current time (as processRevision() does); the other missing values are
   * passed on as null, or 0 for the size, which is what the setters received
   * before.
   *
   * @param array $pageInfo Page state; '_title' and 'id' are used
   * @param array $uploadInfo Parsed upload fields keyed by XML tag name, plus
   *   'fileSrc' / 'isTempSrc' when a local source file is available
   * @return mixed Whatever the upload callback returns
   */
  private function processUpload( $pageInfo, $uploadInfo ) {
      $revision = new WikiRevision( $this->config );
      $text = $uploadInfo['text'] ?? '';

      $revision->setTitle( $pageInfo['_title'] );
      $revision->setID( $pageInfo['id'] ?? null );
      $revision->setTimestamp( $uploadInfo['timestamp'] ?? wfTimestampNow() );
      $revision->setText( $text );
      $revision->setFilename( $uploadInfo['filename'] ?? null );
      if ( isset( $uploadInfo['archivename'] ) ) {
          $revision->setArchiveName( $uploadInfo['archivename'] );
      }
      $revision->setSrc( $uploadInfo['src'] ?? null );
      if ( isset( $uploadInfo['fileSrc'] ) ) {
          // Second argument tells the revision whether the source is a temporary file
          $revision->setFileSrc( $uploadInfo['fileSrc'],
              !empty( $uploadInfo['isTempSrc'] ) );
      }
      if ( isset( $uploadInfo['sha1base36'] ) ) {
          $revision->setSha1Base36( $uploadInfo['sha1base36'] );
      }
      $revision->setSize( intval( $uploadInfo['size'] ?? 0 ) );
      $revision->setComment( $uploadInfo['comment'] ?? null );

      // Unlike processRevision(), both setters run when both values are present
      if ( isset( $uploadInfo['contributor']['ip'] ) ) {
          $revision->setUserIP( $uploadInfo['contributor']['ip'] );
      }
      if ( isset( $uploadInfo['contributor']['username'] ) ) {
          $revision->setUsername(
              $this->externalUserNames->applyPrefix( $uploadInfo['contributor']['username'] )
          );
      }
      $revision->setNoUpdates( $this->mNoUpdates );

      return call_user_func( $this->mUploadCallback, $revision );
  }
  /**
   * Read the children of the current <contributor> element.
   *
   * Only the 'id', 'ip' and 'username' children are collected; every other
   * node is ignored. Reading stops at the closing </contributor> tag.
   *
   * @return array Collected values keyed by tag name; empty for a
   *   self-closing element
   */
  private function handleContributor() {
      $contributor = [];

      // A self-closing <contributor/> has no children to read
      if ( $this->reader->isEmptyElement ) {
          return $contributor;
      }

      $wantedTags = [ 'id', 'ip', 'username' ];

      while ( $this->reader->read() ) {
          $name = $this->reader->localName;

          if ( $this->reader->nodeType == XMLReader::END_ELEMENT && $name == 'contributor' ) {
              break;
          }

          if ( !in_array( $name, $wantedTags ) ) {
              continue;
          }

          $contributor[$name] = $this->nodeContents();
      }

      return $contributor;
  }
  /**
   * Turn a title string from the import source into a local title and check
   * that the page may be imported.
   *
   * A notice is emitted and false is returned when the title is invalid,
   * is an interwiki title, or cannot exist as a page. Outside command-line
   * mode the same happens when the current request user may not edit the
   * page, or may not create it when it does not exist yet.
   *
   * @param string $text Title text from the import source
   * @param string|null $ns Namespace ID from the import source; converted with intval()
   * @return array|bool [ local title, foreign title ] on success, false otherwise
   */
  private function processTitle( $text, $ns = null ) {
      // Without a foreign namespace list, fall back to the naive factory
      $foreignTitleFactory = $this->foreignNamespaces === null
          ? new NaiveForeignTitleFactory()
          : new NamespaceAwareForeignTitleFactory( $this->foreignNamespaces );

      $foreignTitle = $foreignTitleFactory->createForeignTitle( $text, intval( $ns ) );
      $title = $this->importTitleFactory->createTitleFromForeignTitle( $foreignTitle );
      $commandLineMode = $this->config->get( 'CommandLineMode' );

      if ( $title === null ) {
          # Invalid page title? Ignore the page
          $this->notice( 'import-error-invalid', $foreignTitle->getFullText() );
          return false;
      }

      if ( $title->isExternal() ) {
          $this->notice( 'import-error-interwiki', $title->getPrefixedText() );
          return false;
      }

      if ( !$title->canExist() ) {
          $this->notice( 'import-error-special', $title->getPrefixedText() );
          return false;
      }

      if ( $commandLineMode ) {
          # Permission checks are skipped in command-line mode
          return [ $title, $foreignTitle ];
      }

      $permissionManager = MediaWikiServices::getInstance()->getPermissionManager();
      $user = RequestContext::getMain()->getUser();

      $mayEdit = $permissionManager->userCan( 'edit', $user, $title );
      if ( !$mayEdit ) {
          # Do not import if the importing wiki user cannot edit this page
          $this->notice( 'import-error-edit', $title->getPrefixedText() );
          return false;
      }

      if ( !$title->exists() && !$permissionManager->userCan( 'create', $user, $title ) ) {
          # Do not import if the importing wiki user cannot create this page
          $this->notice( 'import-error-create', $title->getPrefixedText() );
          return false;
      }

      return [ $title, $foreignTitle ];
  }
  996. }