SearchHighlighter.php 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588
  1. <?php
  2. /**
  3. * Basic search engine highlighting
  4. *
  5. * This program is free software; you can redistribute it and/or modify
  6. * it under the terms of the GNU General Public License as published by
  7. * the Free Software Foundation; either version 2 of the License, or
  8. * (at your option) any later version.
  9. *
  10. * This program is distributed in the hope that it will be useful,
  11. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13. * GNU General Public License for more details.
  14. *
  15. * You should have received a copy of the GNU General Public License along
  16. * with this program; if not, write to the Free Software Foundation, Inc.,
  17. * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  18. * http://www.gnu.org/copyleft/gpl.html
  19. *
  20. * @file
  21. * @ingroup Search
  22. */
  23. use MediaWiki\MediaWikiServices;
  /**
   * Highlight bits of wikitext
   *
   * Produces short result snippets for search result pages. Three strategies
   * are offered: highlightText() (context-aware, skips templates/tables/images
   * when possible), highlightSimple() (per-line regex match) and
   * highlightNone() (just the first few lines of the text).
   *
   * @ingroup Search
   */
  class SearchHighlighter {
      const DEFAULT_CONTEXT_LINES = 2;
      const DEFAULT_CONTEXT_CHARS = 75;

      /**
       * @var bool Whether extracts are passed through removeWiki() (which strips
       *  basic markup and HTML-escapes the text) before being used in snippets.
       */
      protected $mCleanWikitext = true;

      /**
       * @warning If you pass false to this constructor, then
       *  the caller is responsible for HTML escaping.
       * @param bool $cleanupWikitext
       */
      public function __construct( $cleanupWikitext = true ) {
          $this->mCleanWikitext = $cleanupWikitext;
      }

      /**
       * Wikitext highlighting when $wgAdvancedSearchHighlighting = true
       *
       * The text is first split into "text" extracts and "other" extracts
       * (templates, file links, tables and, if Cite is loaded, <ref> blocks).
       * Snippets are then picked by matching, in order: the whole phrase on text,
       * the whole phrase on other extracts, any term on text, any term on other
       * extracts. Finally the snippets are padded to a roughly constant size,
       * joined, and every term is wrapped in <span class='searchmatch'>.
       *
       * @param string $text
       * @param string[] $terms Terms to highlight (not html escaped but
       *  regex escaped via SearchDatabase::regexTerm())
       * @param int $contextlines
       * @param int $contextchars
       * @return string
       */
      public function highlightText(
          $text,
          $terms,
          $contextlines = self::DEFAULT_CONTEXT_LINES,
          $contextchars = self::DEFAULT_CONTEXT_CHARS
      ) {
          global $wgSearchHighlightBoundaries;

          if ( $text == '' ) {
              return '';
          }

          // split text into text + templates/links/tables
          $spat = "/(\\{\\{)|(\\[\\[[^\\]:]+:)|(\n\\{\\|)";
          // first capture group is for detecting nested templates/links/tables/references
          $endPatterns = [
              1 => '/(\{\{)|(\}\})/', // template
              2 => '/(\[\[)|(\]\])/', // image
              3 => "/(\n\\{\\|)|(\n\\|\\})/", // table
          ];

          // @todo FIXME: This should prolly be a hook or something
          // instead of hardcoding the name of the Cite extension
          if ( \ExtensionRegistry::getInstance()->isLoaded( 'Cite' ) ) {
              $spat .= '|(<ref>)'; // references via cite extension
              $endPatterns[4] = '/(<ref>)|(<\/ref>)/';
          }
          $spat .= '/';

          $textExt = []; // text extracts
          $otherExt = []; // other extracts
          $start = 0;
          $textLen = strlen( $text );
          $count = 0; // sequence number to maintain ordering
          while ( $start < $textLen ) {
              // find start of template/image/table
              if ( preg_match( $spat, $text, $matches, PREG_OFFSET_CAPTURE, $start ) ) {
                  $epat = '';
                  foreach ( $matches as $key => $val ) {
                      if ( $key > 0 && $val[1] != -1 ) {
                          if ( $key == 2 ) {
                              // see if this is an image link
                              $ns = substr( $val[0], 2, -1 );
                              $nsIndex = MediaWikiServices::getInstance()->getContentLanguage()
                                  ->getNsIndex( $ns );
                              if ( $nsIndex != NS_FILE ) {
                                  // Not a file link: $epat stays empty, so the rest of the
                                  // text is added below as one plain-text extract.
                                  break;
                              }
                          }
                          $epat = $endPatterns[$key];
                          $this->splitAndAdd( $textExt, $count, substr( $text, $start, $val[1] - $start ) );
                          $start = $val[1];
                          break;
                      }
                  }

                  if ( $epat ) {
                      // find end (and detect any nested elements)
                      $level = 0;
                      $offset = $start + 1;
                      $found = false;
                      while ( preg_match( $epat, $text, $endMatches, PREG_OFFSET_CAPTURE, $offset ) ) {
                          // Group 2 is only present in the result when the closing
                          // token matched; otherwise a nested opening token matched.
                          if ( array_key_exists( 2, $endMatches ) ) {
                              // found end
                              if ( $level == 0 ) {
                                  $len = strlen( $endMatches[2][0] );
                                  $off = $endMatches[2][1];
                                  $this->splitAndAdd( $otherExt, $count,
                                      substr( $text, $start, $off + $len - $start ) );
                                  $start = $off + $len;
                                  $found = true;
                                  break;
                              } else {
                                  // end of nested element
                                  $level -= 1;
                              }
                          } else {
                              // nested
                              $level += 1;
                          }
                          $offset = $endMatches[0][1] + strlen( $endMatches[0][0] );
                      }
                      if ( !$found ) {
                          // couldn't find appropriate closing tag, skip
                          $this->splitAndAdd( $textExt, $count, substr( $text, $start, strlen( $matches[0][0] ) ) );
                          $start += strlen( $matches[0][0] );
                      }
                      continue;
                  }
              }
              // else: add as text extract
              $this->splitAndAdd( $textExt, $count, substr( $text, $start ) );
              break;
          }

          $all = $textExt + $otherExt; // these have disjunct key sets

          // prepare regexps
          foreach ( $terms as $index => $term ) {
              // manually do upper/lowercase stuff for utf-8 since PHP won't do it
              if ( preg_match( '/[\x80-\xff]/', $term ) ) {
                  $terms[$index] = preg_replace_callback(
                      '/./us',
                      [ $this, 'caseCallback' ],
                      $terms[$index]
                  );
              }
          }

          $anyterm = implode( '|', $terms );
          $phrase = implode( "$wgSearchHighlightBoundaries+", $terms );
          // @todo FIXME: A hack to scale contextchars, a correct solution
          // would be to have contextchars actually be char and not byte
          // length, and do proper utf-8 substrings and lengths everywhere,
          // but PHP is making that very hard and unclean to implement :(
          $anytermChars = mb_strlen( $anyterm );
          // With no (or only empty) terms there is nothing to scale by; avoid
          // dividing by zero and leave $contextchars untouched.
          $scale = $anytermChars > 0 ? strlen( $anyterm ) / $anytermChars : 1;
          $contextchars = intval( $contextchars * $scale );

          $patPre = "(^|$wgSearchHighlightBoundaries)";
          $patPost = "($wgSearchHighlightBoundaries|$)";

          $pat1 = "/(" . $phrase . ")/ui";
          $pat2 = "/$patPre(" . $anyterm . ")$patPost/ui";

          $left = $contextlines;

          $snippets = [];
          $offsets = [];

          // show beginning only if it contains all words
          $first = 0;
          $firstText = '';
          foreach ( $textExt as $index => $line ) {
              if ( strlen( $line ) > 0 && $line[0] != ';' && $line[0] != ':' ) {
                  $firstText = $this->extract( $line, 0, $contextchars * $contextlines );
                  $first = $index;
                  break;
              }
          }
          if ( $firstText ) {
              $succ = true;
              // check if first text contains all terms
              foreach ( $terms as $term ) {
                  if ( !preg_match( "/$patPre" . $term . "$patPost/ui", $firstText ) ) {
                      $succ = false;
                      break;
                  }
              }
              if ( $succ ) {
                  $snippets[$first] = $firstText;
                  $offsets[$first] = 0;
              }
          }
          if ( !$snippets ) {
              // match whole query on text
              $this->process( $pat1, $textExt, $left, $contextchars, $snippets, $offsets );
              // match whole query on templates/tables/images
              $this->process( $pat1, $otherExt, $left, $contextchars, $snippets, $offsets );
              // match any words on text
              $this->process( $pat2, $textExt, $left, $contextchars, $snippets, $offsets );
              // match any words on templates/tables/images
              $this->process( $pat2, $otherExt, $left, $contextchars, $snippets, $offsets );

              ksort( $snippets );
          }

          // add extra chars to each snippet to make snippets constant size
          $extended = [];
          // Only read inside the loop below, which does not run when no snippet
          // exists; initialised so it is never undefined.
          $targetchars = 0;
          if ( count( $snippets ) == 0 ) {
              // couldn't find the target words, just show beginning of article
              if ( array_key_exists( $first, $all ) ) {
                  $targetchars = $contextchars * $contextlines;
                  $snippets[$first] = '';
                  $offsets[$first] = 0;
              }
          } else {
              // if begin of the article contains the whole phrase, show only that !!
              if ( array_key_exists( $first, $snippets ) && preg_match( $pat1, $snippets[$first] )
                  && $offsets[$first] < $contextchars * 2 ) {
                  $snippets = [ $first => $snippets[$first] ];
              }

              // calc by how much to extend existing snippets
              $targetchars = intval( ( $contextchars * $contextlines ) / count( $snippets ) );
          }

          foreach ( $snippets as $index => $line ) {
              $extended[$index] = $line;
              $len = strlen( $line );
              if ( $len < $targetchars - 20 ) {
                  // complete this line
                  if ( $len < strlen( $all[$index] ) ) {
                      $extended[$index] = $this->extract(
                          $all[$index],
                          $offsets[$index],
                          $offsets[$index] + $targetchars,
                          $offsets[$index]
                      );
                      $len = strlen( $extended[$index] );
                  }

                  // add more lines
                  $add = $index + 1;
                  while ( $len < $targetchars - 20
                      && array_key_exists( $add, $all )
                      && !array_key_exists( $add, $snippets ) ) {
                      $offsets[$add] = 0;
                      $tt = "\n" . $this->extract( $all[$add], 0, $targetchars - $len, $offsets[$add] );
                      $extended[$add] = $tt;
                      $len += strlen( $tt );
                      $add++;
                  }
              }
          }

          $snippets = $extended;
          $last = -1;
          $extract = '';
          foreach ( $snippets as $index => $line ) {
              if ( $last == -1 ) {
                  $extract .= $line; // first line
              } elseif ( $last + 1 == $index
                  && $offsets[$last] + strlen( $snippets[$last] ) >= strlen( $all[$last] )
              ) {
                  $extract .= " " . $line; // continuous lines
              } else {
                  $extract .= '<b> ... </b>' . $line;
              }

              $last = $index;
          }
          if ( $extract ) {
              $extract .= '<b> ... </b>';
          }

          $processed = [];
          foreach ( $terms as $term ) {
              if ( !isset( $processed[$term] ) ) {
                  $pat3 = "/$patPre(" . $term . ")$patPost/ui"; // highlight word
                  $highlighted = preg_replace( $pat3,
                      "\\1<span class='searchmatch'>\\2</span>\\3", $extract );
                  // preg_replace() yields null on a PCRE error (e.g. invalid UTF-8
                  // with the /u flag); keep the un-highlighted extract in that case.
                  if ( $highlighted !== null ) {
                      $extract = $highlighted;
                  }
                  $processed[$term] = true;
              }
          }

          return $extract;
      }

      /**
       * Split text into lines and add it to extracts array
       *
       * Each line is trimmed; lines that are empty after trimming are skipped.
       * When cleanup is enabled the text is run through removeWiki() first.
       *
       * @param array &$extracts Index -> $line
       * @param int &$count Next index to use; incremented for every line added
       * @param string $text
       */
      public function splitAndAdd( &$extracts, &$count, $text ) {
          $split = explode( "\n", $this->mCleanWikitext ? $this->removeWiki( $text ) : $text );
          foreach ( $split as $line ) {
              $tt = trim( $line );
              // Explicit comparison: a line that is just "0" is real content,
              // but would be dropped by a plain truthiness check.
              if ( $tt !== '' ) {
                  $extracts[$count++] = $tt;
              }
          }
      }

      /**
       * Do manual case conversion for non-ascii chars
       *
       * Multi-byte characters are replaced by a character class holding their
       * lower- and upper-case forms; single-byte characters are returned as-is.
       *
       * @param array $matches
       * @return string
       */
      public function caseCallback( $matches ) {
          if ( strlen( $matches[0] ) > 1 ) {
              $contLang = MediaWikiServices::getInstance()->getContentLanguage();
              return '[' . $contLang->lc( $matches[0] ) .
                  $contLang->uc( $matches[0] ) . ']';
          } else {
              return $matches[0];
          }
      }

      /**
       * Extract part of the text from start to end, but by
       * not chopping up words
       *
       * @param string $text
       * @param int $start
       * @param int $end
       * @param int|null &$posStart (out) actual start position; only written
       *  when the caller passes a non-null variable
       * @param int|null &$posEnd (out) actual end position; only written
       *  when the caller passes a non-null variable
       * @return string
       */
      public function extract( $text, $start, $end, &$posStart = null, &$posEnd = null ) {
          if ( $start != 0 ) {
              $start = $this->position( $text, $start, 1 );
          }
          if ( $end >= strlen( $text ) ) {
              $end = strlen( $text );
          } else {
              $end = $this->position( $text, $end );
          }

          if ( $posStart !== null ) {
              $posStart = $start;
          }
          if ( $posEnd !== null ) {
              $posEnd = $end;
          }

          if ( $end > $start ) {
              return substr( $text, $start, $end - $start );
          } else {
              return '';
          }
      }

      /**
       * Find a nonletter near a point (index) in the text
       *
       * Looks at a window of 10 bytes either side of $point and returns the
       * position of the first nonletter found in that window.
       *
       * @param string $text
       * @param int $point
       * @param int $offset Offset to found index
       * @return int Nearest nonletter index, or beginning of utf8 char if none
       */
      public function position( $text, $point, $offset = 0 ) {
          $tolerance = 10;
          $textLen = strlen( $text );
          // A negative point would index from the end of the string below.
          $point = max( 0, $point );
          $s = max( 0, $point - $tolerance );
          $l = min( $textLen, $point + $tolerance ) - $s;
          $m = [];

          if ( $l > 0 && preg_match(
              '/[ ,.!?~!@#$%^&*\(\)+=\-\\\|\[\]"\'<>]/',
              substr( $text, $s, $l ),
              $m,
              PREG_OFFSET_CAPTURE
          ) ) {
              return $m[0][1] + $s + $offset;
          }

          // Nothing to inspect at or past the end of the text; reading
          // $text[$point] there would raise an "uninitialized offset" warning.
          if ( $point >= $textLen ) {
              return $textLen;
          }

          // check if point is on a valid first UTF8 char
          $char = ord( $text[$point] );
          while ( $char >= 0x80 && $char < 0xc0 ) {
              // skip trailing bytes
              $point++;
              if ( $point >= $textLen ) {
                  return $textLen;
              }
              $char = ord( $text[$point] );
          }

          return $point;
      }

      /**
       * Search extracts for a pattern, and return snippets
       *
       * Documented upstream as protected, but declared without a visibility
       * keyword and therefore callable from outside; kept public for
       * compatibility.
       *
       * @param string $pattern Regexp for matching lines
       * @param array $extracts Extracts to search
       * @param int &$linesleft Number of extracts to make
       * @param int &$contextchars Length of snippet
       * @param array &$out Map for highlighted snippets
       * @param array &$offsets Map of starting points of snippets
       */
      public function process( $pattern, $extracts, &$linesleft, &$contextchars, &$out, &$offsets ) {
          if ( $linesleft == 0 ) {
              return; // nothing to do
          }
          foreach ( $extracts as $index => $line ) {
              if ( array_key_exists( $index, $out ) ) {
                  continue; // this line already highlighted
              }

              $m = [];
              if ( !preg_match( $pattern, $line, $m, PREG_OFFSET_CAPTURE ) ) {
                  continue;
              }

              $offset = $m[0][1];
              $len = strlen( $m[0][0] );
              if ( $offset + $len < $contextchars ) {
                  // match sits near the start of the line: snippet starts at 0
                  $begin = 0;
              } elseif ( $len > $contextchars ) {
                  // match is longer than the snippet: start at the match
                  $begin = $offset;
              } else {
                  // centre the match inside the snippet window
                  $begin = $offset + intval( ( $len - $contextchars ) / 2 );
              }

              $end = $begin + $contextchars;

              $posBegin = $begin;
              // basic snippet from this line
              $out[$index] = $this->extract( $line, $begin, $end, $posBegin );
              $offsets[$index] = $posBegin;
              $linesleft--;
              if ( $linesleft == 0 ) {
                  return;
              }
          }
      }

      /**
       * Basic wikitext removal
       *
       * Strips simple templates, link brackets, tags and quote-based emphasis,
       * then HTML-escapes the result. Documented upstream as protected, but
       * declared without a visibility keyword; kept public for compatibility.
       *
       * @param string $text
       * @return mixed
       */
      public function removeWiki( $text ) {
          $text = preg_replace( "/\\{\\{([^|]+?)\\}\\}/", "", $text );
          $text = preg_replace( "/\\{\\{([^|]+\\|)(.*?)\\}\\}/", "\\2", $text );
          $text = preg_replace( "/\\[\\[([^|]+?)\\]\\]/", "\\1", $text );
          $text = preg_replace_callback(
              "/\\[\\[([^|]+\\|)(.*?)\\]\\]/",
              [ $this, 'linkReplace' ],
              $text
          );
          $text = preg_replace( "/<\/?[^>]+>/", "", $text );
          $text = preg_replace( "/'''''/", "", $text );
          $text = preg_replace( "/('''|<\/?[iIuUbB]>)/", "", $text );
          $text = preg_replace( "/''/", "", $text );

          // Note, the previous /<\/?[^>]+>/ is insufficient
          // for XSS safety as the HTML tag can span multiple
          // search results (T144845).
          $text = Sanitizer::escapeHtmlAllowEntities( $text );
          return $text;
      }

      /**
       * callback to replace [[target|caption]] kind of links, if
       * the target is category or image, leave it
       *
       * @param array $matches
       * @return string
       */
      public function linkReplace( $matches ) {
          $colon = strpos( $matches[1], ':' );
          if ( $colon === false ) {
              return $matches[2]; // replace with caption
          }
          $ns = substr( $matches[1], 0, $colon );
          $index = MediaWikiServices::getInstance()->getContentLanguage()->getNsIndex( $ns );
          if ( $index !== false && ( $index == NS_FILE || $index == NS_CATEGORY ) ) {
              return $matches[0]; // return the whole thing
          } else {
              return $matches[2];
          }
      }

      /**
       * Simple & fast snippet extraction, but gives completely irrelevant
       * snippets
       *
       * Used when $wgAdvancedSearchHighlighting is false.
       *
       * @param string $text
       * @param string[] $terms Escaped for regex by SearchDatabase::regexTerm()
       * @param int $contextlines
       * @param int $contextchars
       * @return string
       */
      public function highlightSimple(
          $text,
          $terms,
          $contextlines = self::DEFAULT_CONTEXT_LINES,
          $contextchars = self::DEFAULT_CONTEXT_CHARS
      ) {
          $lines = explode( "\n", $text );

          $terms = implode( '|', $terms );
          $max = intval( $contextchars ) + 1;
          $pat1 = "/(.*)($terms)(.{0,$max})/i";
          $pat2 = '/(' . $terms . ")/i";

          $extract = "";
          $contLang = MediaWikiServices::getInstance()->getContentLanguage();
          foreach ( $lines as $line ) {
              if ( $contextlines == 0 ) {
                  break;
              }
              $m = [];
              if ( !preg_match( $pat1, $line, $m ) ) {
                  continue;
              }
              --$contextlines;
              // truncate function changes ... to relevant i18n message.
              $pre = $contLang->truncateForVisual( $m[1], - $contextchars, '...', false );
              $post = isset( $m[3] )
                  ? $contLang->truncateForVisual( $m[3], $contextchars, '...', false )
                  : '';
              $found = $m[2];

              // Terms are matched on the raw text and every piece is escaped on its
              // own; matching on already-escaped text lets a term such as "amp" hit
              // the inside of an entity like "&amp;" and break it.
              $extract .= $this->markMatches( $pat2, $pre . $found . $post ) . "\n";
          }

          return $extract;
      }

      /**
       * HTML-escape a raw line and wrap every match of $pattern in a
       * searchmatch span.
       *
       * @param string $pattern Regexp whose whole match is highlighted
       * @param string $raw Unescaped text
       * @return string HTML
       */
      protected function markMatches( $pattern, $raw ) {
          $hits = [];
          if ( !preg_match_all( $pattern, $raw, $hits, PREG_OFFSET_CAPTURE ) ) {
              return htmlspecialchars( $raw );
          }

          $html = '';
          $pos = 0;
          foreach ( $hits[0] as $hit ) {
              $matched = $hit[0];
              $at = $hit[1];
              if ( $matched === '' ) {
                  continue; // an empty match has nothing to highlight
              }
              $html .= htmlspecialchars( substr( $raw, $pos, $at - $pos ) )
                  . "<span class='searchmatch'>" . htmlspecialchars( $matched ) . '</span>';
              $pos = $at + strlen( $matched );
          }

          return $html . htmlspecialchars( (string)substr( $raw, $pos ) );
      }

      /**
       * Returns the first few lines of the text
       *
       * @param string $text
       * @param int $contextlines Max number of returned lines
       * @param int $contextchars Average number of characters per line
       * @return string
       */
      public function highlightNone(
          $text,
          $contextlines = self::DEFAULT_CONTEXT_LINES,
          $contextchars = self::DEFAULT_CONTEXT_CHARS
      ) {
          $maxLines = max( 0, (int)$contextlines );
          $maxBytes = max( 0, (int)( $contextlines * $contextchars ) );

          $match = [];
          $text = ltrim( $text ) . "\n"; // make sure the preg_match may find the last line
          // remove empty lines; a run of any length collapses to one line break
          $text = preg_replace( '/\n{2,}/', "\n", $text );
          preg_match( "/^(.*\n){0,$maxLines}/", $text, $match );

          // Trim and limit to max number of bytes, without splitting a UTF-8 character
          $text = htmlspecialchars( $this->truncateUtf8Bytes( trim( $match[0] ), $maxBytes ) );
          return str_replace( "\n", '<br>', $text );
      }

      /**
       * Cut a string to at most $limit bytes, backing up to the start of a
       * UTF-8 character when the cut would land in the middle of one.
       *
       * @param string $text
       * @param int $limit Maximum length in bytes
       * @return string
       */
      protected function truncateUtf8Bytes( $text, $limit ) {
          if ( strlen( $text ) <= $limit ) {
              return $text;
          }
          // $text[$limit] is the first byte that would be dropped; if it is a
          // continuation byte (10xxxxxx) the cut is inside a character.
          while ( $limit > 0 && ( ord( $text[$limit] ) & 0xC0 ) === 0x80 ) {
              $limit--;
          }
          return (string)substr( $text, 0, $limit );
      }
  }