RefreshLinksJob.php 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421
  1. <?php
  2. /**
  3. * Job to update link tables for pages
  4. *
  5. * This program is free software; you can redistribute it and/or modify
  6. * it under the terms of the GNU General Public License as published by
  7. * the Free Software Foundation; either version 2 of the License, or
  8. * (at your option) any later version.
  9. *
  10. * This program is distributed in the hope that it will be useful,
  11. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13. * GNU General Public License for more details.
  14. *
  15. * You should have received a copy of the GNU General Public License along
  16. * with this program; if not, write to the Free Software Foundation, Inc.,
  17. * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  18. * http://www.gnu.org/copyleft/gpl.html
  19. *
  20. * @file
  21. * @ingroup JobQueue
  22. */
  23. use MediaWiki\MediaWikiServices;
  24. use MediaWiki\Revision\RevisionRecord;
  25. use MediaWiki\Revision\RevisionRenderer;
  26. use Liuggio\StatsdClient\Factory\StatsdDataFactoryInterface;
  27. /**
  28. * Job to update link tables for pages
  29. *
  30. * This job comes in a few variants:
  31. * - a) Recursive jobs to update links for backlink pages for a given title.
  32. * These jobs have (recursive:true,table:<table>) set.
  33. * - b) Jobs to update links for a set of pages (the job title is ignored).
  34. * These jobs have (pages:(<page ID>:(<namespace>,<title>),...) set.
  35. * - c) Jobs to update links for a single page (the job title)
  36. * These jobs need no extra fields set.
  37. *
  38. * @ingroup JobQueue
  39. */
  class RefreshLinksJob extends Job {
      /** @var int Lag safety margin when comparing root job times to last-refresh times */
      const NORMAL_MAX_LAG = 10;
      /** @var int How many seconds to wait for replica DBs to catch up */
      const LAG_WAIT_TIMEOUT = 15;

      /**
       * @param Title $title Job title; ignored by run() when $params['pages'] is set
       * @param array $params Job parameters. Keys read by this class include 'recursive',
       *  'range', 'pages', 'rootJobTimestamp', 'isOpportunistic', 'triggeringRevisionId',
       *  'triggeringUser', 'useRecursiveLinksUpdate', 'causeAction' and 'causeAgent'.
       */
      public function __construct( Title $title, array $params ) {
          parent::__construct( 'refreshLinks', $title, $params );
          // Avoid the overhead of de-duplication when it would be pointless
          $this->removeDuplicates = (
              // Ranges rarely will line up
              !isset( $params['range'] ) &&
              // Multiple pages per job make matches unlikely
              !( isset( $params['pages'] ) && count( $params['pages'] ) != 1 )
          );
          // Default the cause fields so later reads of them never hit an undefined index
          $this->params += [ 'causeAction' => 'unknown', 'causeAgent' => 'unknown' ];
          // Tell JobRunner to not automatically wrap run() in a transaction round.
          // Each runForTitle() call will manage its own rounds in order to run DataUpdates
          // and to avoid contention as well.
          $this->executionFlags |= self::JOB_NO_EXPLICIT_TRX_ROUND;
      }

      /**
       * Create a job whose command is 'refreshLinksPrioritized'
       *
       * @param Title $title
       * @param array $params
       * @return RefreshLinksJob
       */
      public static function newPrioritized( Title $title, array $params ) {
          $job = new self( $title, $params );
          $job->command = 'refreshLinksPrioritized';

          return $job;
      }

      /**
       * Create a job whose command is 'refreshLinksDynamic'
       *
       * @param Title $title
       * @param array $params
       * @return RefreshLinksJob
       */
      public static function newDynamic( Title $title, array $params ) {
          $job = new self( $title, $params );
          $job->command = 'refreshLinksDynamic';

          return $job;
      }

      /**
       * Run the job in one of its three variants (recursive, per-pages, single title)
       *
       * @return bool False if any title was invalid or any per-title update failed;
       *  the recursive variant only enqueues further jobs and always returns true
       */
      public function run() {
          $ok = true;

          // Job to update all (or a range of) backlink pages for a page
          if ( !empty( $this->params['recursive'] ) ) {
              $services = MediaWikiServices::getInstance();
              // When the base job branches, wait for the replica DBs to catch up to the master.
              // From then on, we know that any template changes at the time the base job was
              // enqueued will be reflected in backlink page parses when the leaf jobs run.
              if ( !isset( $this->params['range'] ) ) {
                  $lbFactory = $services->getDBLoadBalancerFactory();
                  if ( !$lbFactory->waitForReplication( [
                      'domain' => $lbFactory->getLocalDomainID(),
                      'timeout' => self::LAG_WAIT_TIMEOUT
                  ] ) ) { // only try so hard
                      $stats = $services->getStatsdDataFactory();
                      $stats->increment( 'refreshlinks.lag_wait_failed' );
                  }
              }
              // Carry over information for de-duplication
              $extraParams = $this->getRootJobParams();
              $extraParams['triggeredRecursive'] = true;
              // Carry over cause information for logging
              $extraParams['causeAction'] = $this->params['causeAction'];
              $extraParams['causeAgent'] = $this->params['causeAgent'];
              // Convert this into no more than $wgUpdateRowsPerJob RefreshLinks per-title
              // jobs and possibly a recursive RefreshLinks job for the rest of the backlinks
              $jobs = BacklinkJobUtils::partitionBacklinkJob(
                  $this,
                  $services->getMainConfig()->get( 'UpdateRowsPerJob' ),
                  1, // job-per-title
                  [ 'params' => $extraParams ]
              );
              JobQueueGroup::singleton()->push( $jobs );
          // Job to update link tables for a set of titles
          } elseif ( isset( $this->params['pages'] ) ) {
              foreach ( $this->params['pages'] as list( $ns, $dbKey ) ) {
                  $title = Title::makeTitleSafe( $ns, $dbKey );
                  if ( $title ) {
                      // Keep going after a failure so the remaining pages still get updated
                      $ok = $this->runForTitle( $title ) && $ok;
                  } else {
                      $ok = false;
                      $this->setLastError( "Invalid title ($ns,$dbKey)." );
                  }
              }
          // Job to update link tables for a given title
          } else {
              $ok = $this->runForTitle( $this->title );
          }

          return $ok;
      }

      /**
       * Refresh the link tables of a single page
       *
       * @param Title $title
       * @return bool False if the page lock could not be acquired or no parser output
       *  could be obtained; true if the update ran or was skipped as already done
       */
      protected function runForTitle( Title $title ) {
          $services = MediaWikiServices::getInstance();
          $stats = $services->getStatsdDataFactory();
          $renderer = $services->getRevisionRenderer();
          $parserCache = $services->getParserCache();
          $lbFactory = $services->getDBLoadBalancerFactory();
          $ticket = $lbFactory->getEmptyTransactionTicket( __METHOD__ );

          // Load the page from the master DB
          $page = WikiPage::factory( $title );
          $page->loadPageData( WikiPage::READ_LATEST );

          // Serialize link update jobs by page ID so they see each others' changes.
          // The page ID and latest revision ID will be queried again after the lock
          // is acquired to bail if they are changed from that of loadPageData() above.
          $dbw = $lbFactory->getMainLB()->getConnectionRef( DB_MASTER );
          /** @noinspection PhpUnusedLocalVariableInspection */
          $scopedLock = LinksUpdate::acquirePageLock( $dbw, $page->getId(), 'job' );
          if ( $scopedLock === null ) {
              // Another job is already updating the page, likely for a prior revision (T170596)
              $this->setLastError( 'LinksUpdate already running for this page, try again later.' );
              $stats->increment( 'refreshlinks.lock_failure' );

              return false;
          }

          if ( $this->isAlreadyRefreshed( $page ) ) {
              $stats->increment( 'refreshlinks.update_skipped' );

              return true;
          }

          // Parse during a fresh transaction round for better read consistency
          $lbFactory->beginMasterChanges( __METHOD__ );
          $output = $this->getParserOutput( $renderer, $parserCache, $page, $stats );
          $options = $this->getDataUpdateOptions();
          $lbFactory->commitMasterChanges( __METHOD__ );

          if ( !$output ) {
              return false; // raced out?
          }

          // Tell DerivedPageDataUpdater to use this parser output
          $options['known-revision-output'] = $output;
          // Execute corresponding DataUpdates immediately
          $page->doSecondaryDataUpdates( $options );
          InfoAction::invalidateCache( $title );

          // Commit any writes here in case this method is called in a loop.
          // In that case, the scoped lock will fail to be acquired.
          $lbFactory->commitAndWaitForReplication( __METHOD__, $ticket );

          return true;
      }

      /**
       * Get the root job timestamp, padded for lag unless the job is opportunistic
       *
       * Shared by isAlreadyRefreshed() and getParserOutputFromCache() so that both
       * apply the same freshness threshold.
       *
       * @return string|bool|null Null if the job has no 'rootJobTimestamp' parameter;
       *  otherwise the raw parameter value (opportunistic jobs) or whatever
       *  wfTimestamp( TS_MW, ... ) yields for that time plus NORMAL_MAX_LAG seconds
       */
      private function getLagAwareRootJobTimestamp() {
          $rootTimestamp = $this->params['rootJobTimestamp'] ?? null;
          if ( $rootTimestamp === null ) {
              return null;
          }

          if ( !empty( $this->params['isOpportunistic'] ) ) {
              // Neither clock skew nor DB snapshot/replica DB lag matter much for
              // such updates; focus on reusing the (often recently updated) cache
              return $rootTimestamp;
          }

          // For transclusion updates, the template changes must be reflected
          return wfTimestamp(
              TS_MW,
              wfTimestamp( TS_UNIX, $rootTimestamp ) + self::NORMAL_MAX_LAG
          );
      }

      /**
       * @param WikiPage $page
       * @return bool Whether something updated the backlinks with data newer than this job
       */
      private function isAlreadyRefreshed( WikiPage $page ) {
          // Threshold derived from the timestamp of the change that triggered this job
          $lagAwareTimestamp = $this->getLagAwareRootJobTimestamp();
          if ( $lagAwareTimestamp === null ) {
              // No root job timestamp to compare against, so never skip
              return false;
          }

          return ( $page->getLinksTimestamp() > $lagAwareTimestamp );
      }

      /**
       * Get the parser output if the page is unchanged from what was loaded in $page
       *
       * @param RevisionRenderer $renderer
       * @param ParserCache $parserCache
       * @param WikiPage $page Page already loaded with READ_LATEST
       * @param StatsdDataFactoryInterface $stats
       * @return ParserOutput|null Combined output for all slots; might only contain metadata
       */
      private function getParserOutput(
          RevisionRenderer $renderer,
          ParserCache $parserCache,
          WikiPage $page,
          StatsdDataFactoryInterface $stats
      ) {
          $revision = $this->getCurrentRevisionIfUnchanged( $page, $stats );
          if ( !$revision ) {
              return null; // race condition?
          }

          $cachedOutput = $this->getParserOutputFromCache( $parserCache, $page, $revision, $stats );
          if ( $cachedOutput ) {
              return $cachedOutput;
          }

          $renderedRevision = $renderer->getRenderedRevision(
              $revision,
              $page->makeParserOptions( 'canonical' ),
              null,
              [ 'audience' => $revision::RAW ]
          );

          $parseTimestamp = wfTimestampNow(); // timestamp that parsing started
          $output = $renderedRevision->getRevisionParserOutput( [ 'generate-html' => false ] );
          $output->setCacheTime( $parseTimestamp ); // notify LinksUpdate::doUpdate()

          return $output;
      }

      /**
       * Get the current revision record if it is unchanged from what was loaded in $page
       *
       * @param WikiPage $page Page already loaded with READ_LATEST
       * @param StatsdDataFactoryInterface $stats
       * @return RevisionRecord|null The same instance that $page->getRevisionRecord() uses
       */
      private function getCurrentRevisionIfUnchanged(
          WikiPage $page,
          StatsdDataFactoryInterface $stats
      ) {
          $title = $page->getTitle();
          // Get the latest ID since acquirePageLock() in runForTitle() flushed the transaction.
          // This is used to detect edits/moves after loadPageData() but before the scope lock.
          // This works around the chicken/egg problem of determining the scope lock key name.
          $latest = $title->getLatestRevID( Title::READ_LATEST );

          $triggeringRevisionId = $this->params['triggeringRevisionId'] ?? null;
          if ( $triggeringRevisionId && $triggeringRevisionId !== $latest ) {
              // This job is obsolete and one for the latest revision will handle updates
              $stats->increment( 'refreshlinks.rev_not_current' );
              $this->setLastError( "Revision $triggeringRevisionId is not current" );

              return null;
          }

          // Load the current revision. Note that $page should have loaded with READ_LATEST.
          // This instance will be reused in WikiPage::doSecondaryDataUpdates() later on.
          $revision = $page->getRevisionRecord();
          if ( !$revision ) {
              $stats->increment( 'refreshlinks.rev_not_found' );
              $this->setLastError( "Revision not found for {$title->getPrefixedDBkey()}" );

              return null; // just deleted?
          } elseif ( $revision->getId() !== $latest || $revision->getPageId() !== $page->getId() ) {
              // Do not clobber over newer updates with older ones. If all jobs were FIFO and
              // serialized, it would be OK to update links based on older revisions since it
              // would eventually get to the latest. Since that is not the case (by design),
              // only update the link tables to a state matching the current revision's output.
              $stats->increment( 'refreshlinks.rev_not_current' );
              $this->setLastError( "Revision {$revision->getId()} is not current" );

              return null;
          }

          return $revision;
      }

      /**
       * Get the parser output from cache if it reflects the change that triggered this job
       *
       * @param ParserCache $parserCache
       * @param WikiPage $page
       * @param RevisionRecord $currentRevision
       * @param StatsdDataFactoryInterface $stats
       * @return ParserOutput|null
       */
      private function getParserOutputFromCache(
          ParserCache $parserCache,
          WikiPage $page,
          RevisionRecord $currentRevision,
          StatsdDataFactoryInterface $stats
      ) {
          $cachedOutput = null;
          // If page_touched changed after this root job, then it is likely that
          // any views of the pages already resulted in re-parses which are now in
          // cache. The cache can be reused to avoid expensive parsing in some cases.
          $rootTimestamp = $this->params['rootJobTimestamp'] ?? null;
          if ( $rootTimestamp !== null ) {
              $opportunistic = !empty( $this->params['isOpportunistic'] );
              // Same threshold that isAlreadyRefreshed() uses
              $lagAwareTimestamp = $this->getLagAwareRootJobTimestamp();

              if ( $page->getTouched() >= $rootTimestamp || $opportunistic ) {
                  // Cache is suspected to be up-to-date so it's worth the I/O of checking.
                  // As long as the cache rev ID matches the current rev ID and it reflects
                  // the job's triggering change, then it is usable.
                  $parserOptions = $page->makeParserOptions( 'canonical' );
                  $output = $parserCache->getDirty( $page, $parserOptions );
                  if (
                      $output &&
                      $output->getCacheRevisionId() == $currentRevision->getId() &&
                      $output->getCacheTime() >= $lagAwareTimestamp
                  ) {
                      $cachedOutput = $output;
                  }
              }
          }

          if ( $cachedOutput ) {
              $stats->increment( 'refreshlinks.parser_cached' );
          } else {
              $stats->increment( 'refreshlinks.parser_uncached' );
          }

          return $cachedOutput;
      }

      /**
       * Build the options array handed to WikiPage::doSecondaryDataUpdates()
       *
       * @return array
       */
      private function getDataUpdateOptions() {
          $options = [
              'recursive' => !empty( $this->params['useRecursiveLinksUpdate'] ),
              // Carry over cause so the update can do extra logging
              'causeAction' => $this->params['causeAction'],
              'causeAgent' => $this->params['causeAgent']
          ];
          if ( !empty( $this->params['triggeringUser'] ) ) {
              $userInfo = $this->params['triggeringUser'];
              if ( $userInfo['userId'] ) {
                  $options['triggeringUser'] = User::newFromId( $userInfo['userId'] );
              } else {
                  // Anonymous, use the username
                  $options['triggeringUser'] = User::newFromName( $userInfo['userName'], false );
              }
          }

          return $options;
      }

      /**
       * Get the de-duplication info with the cause fields (and, for per-pages jobs,
       * the job title) removed so that they do not defeat duplicate detection
       *
       * @return array
       */
      public function getDeduplicationInfo() {
          $info = parent::getDeduplicationInfo();
          // Kept from the previous implementation in case a parent ever exposes
          // these keys at the top level
          unset( $info['causeAction'], $info['causeAgent'] );
          if ( is_array( $info['params'] ) ) {
              // The constructor stores the cause fields in the job parameters, and the
              // 'pages' check below shows the parameters live under 'params', so this is
              // where they have to be stripped for otherwise-identical jobs to match.
              // NOTE(review): parent::getDeduplicationInfo() is not visible here - confirm.
              unset( $info['params']['causeAction'], $info['params']['causeAgent'] );
              // For per-pages jobs, the job title is that of the template that changed
              // (or similar), so remove that since it ruins duplicate detection
              if ( isset( $info['params']['pages'] ) ) {
                  unset( $info['namespace'], $info['title'] );
              }
          }

          return $info;
      }

      /**
       * @return int Number of titles this job refreshes itself: 0 for recursive jobs,
       *  the size of the 'pages' parameter for per-pages jobs, otherwise 1
       */
      public function workItemCount() {
          if ( !empty( $this->params['recursive'] ) ) {
              return 0; // nothing actually refreshed
          } elseif ( isset( $this->params['pages'] ) ) {
              return count( $this->params['pages'] );
          }

          return 1; // one title
      }
  }