StringUtils.php 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364
  1. <?php
  2. use Wikimedia\AtEase\AtEase;
  3. /**
  4. * Methods to play with strings.
  5. *
  6. * This program is free software; you can redistribute it and/or modify
  7. * it under the terms of the GNU General Public License as published by
  8. * the Free Software Foundation; either version 2 of the License, or
  9. * (at your option) any later version.
  10. *
  11. * This program is distributed in the hope that it will be useful,
  12. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  14. * GNU General Public License for more details.
  15. *
  16. * You should have received a copy of the GNU General Public License along
  17. * with this program; if not, write to the Free Software Foundation, Inc.,
  18. * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  19. * http://www.gnu.org/copyleft/gpl.html
  20. *
  21. * @file
  22. */
  23. /**
  24. * A collection of static methods to play with strings.
  25. */
  class StringUtils {
      /**
       * Test whether a string is valid UTF-8.
       *
       * The function checks for invalid byte sequences and overlong encodings,
       * but not for different normalisations.
       *
       * @note In MediaWiki 1.21, this function did not provide proper UTF-8 validation.
       * In particular, the pure PHP code path did not in fact check for overlong forms.
       * Beware of this when backporting code to that version of MediaWiki.
       *
       * @since 1.21
       * @param string $value String to check (cast to string before checking)
       * @return bool Whether the given $value is a valid UTF-8 encoded string
       */
      public static function isUtf8( $value ) {
          return mb_check_encoding( (string)$value, 'UTF-8' );
      }

      /**
       * Explode a string, but ignore any instances of the separator inside
       * the given start and end delimiters, which may optionally nest.
       * The delimiters are literal strings, not regular expressions.
       *
       * An end delimiter that has no matching open start delimiter is left in
       * the output and does not affect how the rest of the string is split.
       *
       * @param string $startDelim Start delimiter
       * @param string $endDelim End delimiter
       * @param string $separator Separator string for the explode.
       * @param string $subject Subject string to explode.
       * @param bool $nested True iff the delimiters are allowed to nest.
       * @throws InvalidArgumentException If a delimiter or the separator is an
       *  empty string and the subject is not empty
       * @return ArrayIterator
       */
      public static function delimiterExplode( $startDelim, $endDelim, $separator,
          $subject, $nested = false
      ) {
          $inputPos = 0;
          $lastPos = 0;
          $depth = 0;
          $len = strlen( $subject );
          // One alternation over the three literal tokens; built once, outside the loop.
          $pattern = '!' . preg_quote( $startDelim, '!' )
              . '|' . preg_quote( $endDelim, '!' )
              . '|' . preg_quote( $separator, '!' ) . '!S';
          $m = [];
          $exploded = [];

          while (
              $inputPos < $len &&
              preg_match( $pattern, $subject, $m, PREG_OFFSET_CAPTURE, $inputPos )
          ) {
              $match = $m[0][0];
              $matchPos = $m[0][1];
              if ( $match === '' ) {
                  // An empty token matches at $inputPos without consuming anything,
                  // so the scan could never advance.
                  throw new InvalidArgumentException(
                      'Empty delimiter or separator given to ' . __METHOD__
                  );
              }
              $inputPos = $matchPos + strlen( $match );

              if ( $match === $separator ) {
                  // Only separators outside any delimiter pair split the string
                  if ( $depth === 0 ) {
                      $exploded[] = substr( $subject, $lastPos, $matchPos - $lastPos );
                      $lastPos = $inputPos;
                  }
              } elseif ( $match === $startDelim ) {
                  // Without nesting, a start delimiter inside an open pair is plain text
                  if ( $depth === 0 || $nested ) {
                      $depth++;
                  }
              } elseif ( $depth > 0 ) {
                  // End delimiter closing an open pair. An unmatched end delimiter
                  // must not push the depth below zero, otherwise every later
                  // top-level separator would be skipped.
                  $depth--;
              }
          }
          // Whatever follows the last split point is the final piece
          $exploded[] = substr( $subject, $lastPos );

          // This method could be rewritten in the future to avoid creating an
          // intermediate array, since the return type is just an iterator.
          return new ArrayIterator( $exploded );
      }

      /**
       * Perform an operation equivalent to `preg_replace()`
       *
       * Matches this code:
       *
       *     preg_replace( "!$startDelim(.*?)$endDelim!", $replace, $subject );
       *
       * ..except that it's worst-case O(N) instead of O(N^2). Compared to delimiterReplace(), this
       * implementation is fast but memory-hungry and inflexible. The memory requirements are such
       * that I don't recommend using it on anything but guaranteed small chunks of text.
       *
       * The delimiters are literal strings and $replace is inserted verbatim.
       *
       * @param string $startDelim
       * @param string $endDelim
       * @param string $replace
       * @param string $subject
       * @return string
       */
      public static function hungryDelimiterReplace( $startDelim, $endDelim, $replace, $subject ) {
          $segments = explode( $startDelim, $subject );
          // Everything before the first start delimiter is copied unchanged
          $output = array_shift( $segments );
          foreach ( $segments as $segment ) {
              $endDelimPos = strpos( $segment, $endDelim );
              if ( $endDelimPos === false ) {
                  // No closing delimiter in this segment: restore the start
                  // delimiter that explode() removed and keep the text as is
                  $output .= $startDelim . $segment;
              } else {
                  // Drop the delimited part and keep what follows the end delimiter
                  $output .= $replace . substr( $segment, $endDelimPos + strlen( $endDelim ) );
              }
          }
          return $output;
      }

      /**
       * Perform an operation equivalent to `preg_replace_callback()`
       *
       * Matches this code:
       *
       *     preg_replace_callback( "!$startDelim(.*)$endDelim!s$flags", $callback, $subject );
       *
       * If the start delimiter ends with an initial substring of the end delimiter,
       * e.g. in the case of C-style comments, the behavior differs from the model
       * regex. In this implementation, the end must share no characters with the
       * start, so e.g. `/*\/` is not considered to be both the start and end of a
       * comment. `/*\/xy/*\/` is considered to be a single comment with contents `/xy/`.
       *
       * The implementation of delimiterReplaceCallback() is slower than hungryDelimiterReplace()
       * but uses far less memory. The delimiters are literal strings, not regular expressions.
       *
       * The callback receives a two-element array: the whole match including both
       * delimiters, and the text between the delimiters. Its return value replaces
       * the whole match.
       *
       * @param string $startDelim Start delimiter
       * @param string $endDelim End delimiter
       * @param callable $callback Function to call on each match
       * @param string $subject
       * @param string $flags Regular expression flags
       * @throws InvalidArgumentException If a delimiter produces an empty match
       * @return string
       */
      public static function delimiterReplaceCallback( $startDelim, $endDelim, $callback,
          $subject, $flags = ''
      ) {
          $inputPos = 0;
          $outputPos = 0;
          $contentPos = 0;
          $output = '';
          $foundStart = false;
          $encStart = preg_quote( $startDelim, '!' );
          $encEnd = preg_quote( $endDelim, '!' );
          // Compare literally in the same case mode the regex uses
          $strcmp = strpos( $flags, 'i' ) === false ? 'strcmp' : 'strcasecmp';
          $endLength = strlen( $endDelim );
          $subjectLength = strlen( $subject );
          $pattern = "!($encStart)|($encEnd)!S$flags";
          $m = [];

          while ( $inputPos < $subjectLength &&
              preg_match( $pattern, $subject, $m, PREG_OFFSET_CAPTURE, $inputPos )
          ) {
              $tokenOffset = $m[0][1];

              // Classify the token as a start or an end delimiter
              if ( $m[1][0] !== '' ) {
                  // The start alternative matched. If a start is already open and an
                  // end delimiter is present at the same location, treat it as the end.
                  $isEnd = $foundStart &&
                      $strcmp( $endDelim, substr( $subject, $tokenOffset, $endLength ) ) === 0;
                  $tokenLength = $isEnd ? $endLength : strlen( $m[0][0] );
              } elseif ( isset( $m[2] ) && $m[2][0] !== '' ) {
                  $isEnd = true;
                  $tokenLength = strlen( $m[0][0] );
              } else {
                  // Neither group captured any text, i.e. a delimiter matched emptily
                  throw new InvalidArgumentException( 'Invalid delimiter given to ' . __METHOD__ );
              }

              if ( !$isEnd ) {
                  # Only move the start position if we haven't already found a start
                  # This means that START START END matches outer pair
                  if ( !$foundStart ) {
                      # Write out the non-matching section
                      $output .= substr( $subject, $outputPos, $tokenOffset - $outputPos );
                      $outputPos = $tokenOffset;
                      $inputPos = $contentPos = $tokenOffset + $tokenLength;
                      $foundStart = true;
                  } else {
                      # Move the input position past the *first character* of START,
                      # to protect against missing END when it overlaps with START
                      $inputPos = $tokenOffset + 1;
                  }
              } else {
                  $tokenEnd = $tokenOffset + $tokenLength;
                  if ( $foundStart ) {
                      # Found match: hand the full match and its inner content to the callback
                      $output .= $callback( [
                          substr( $subject, $outputPos, $tokenEnd - $outputPos ),
                          substr( $subject, $contentPos, $tokenOffset - $contentPos )
                      ] );
                      $foundStart = false;
                  } else {
                      # Non-matching end, write it out together with the text before it
                      $output .= substr( $subject, $outputPos, $tokenEnd - $outputPos );
                  }
                  $inputPos = $outputPos = $tokenEnd;
              }
          }

          # Copy the unprocessed tail, including any start that was never closed
          if ( $outputPos < $subjectLength ) {
              $output .= substr( $subject, $outputPos );
          }
          return $output;
      }

      /**
       * Perform an operation equivalent to `preg_replace()` with flags.
       *
       * Matches this code:
       *
       *     preg_replace( "!$startDelim(.*)$endDelim!$flags", $replace, $subject );
       *
       * The delimiters are literal strings; they are passed unchanged to
       * delimiterReplaceCallback().
       *
       * @param string $startDelim Start delimiter
       * @param string $endDelim End delimiter
       * @param string $replace Replacement string. May contain $0, which will be
       *  replaced by the whole match including the delimiters, and $1, which will
       *  be replaced by the text between the delimiters. Only these two literal
       *  tokens are substituted.
       * @param string $subject String to search
       * @param string $flags Regular expression flags
       * @return string The string with the matches replaced
       */
      public static function delimiterReplace( $startDelim, $endDelim, $replace, $subject, $flags = '' ) {
          $expand = function ( array $matches ) use ( $replace ) {
              return strtr( $replace, [ '$0' => $matches[0], '$1' => $matches[1] ] );
          };
          return self::delimiterReplaceCallback( $startDelim, $endDelim, $expand, $subject, $flags );
      }

      /**
       * More or less "markup-safe" explode()
       * Ignores any instances of the separator inside `<...>`
       *
       * NUL bytes are removed from $text, because NUL is used internally as a
       * placeholder.
       *
       * @param string $separator
       * @param string $text
       * @return array
       */
      public static function explodeMarkup( $separator, $text ) {
          $placeholder = "\x00";
          $cleaned = self::maskInsideMarkup( $separator, $text, $placeholder );
          // Explode, then put the replaced separators back in; str_replace()
          // applied to an array processes each piece and keeps the keys.
          return str_replace( $placeholder, $separator, explode( $separator, $cleaned ) );
      }

      /**
       * More or less "markup-safe" str_replace()
       * Ignores any instances of the search string inside `<...>`
       *
       * NUL bytes are removed from $text, because NUL is used internally as a
       * placeholder.
       *
       * @param string $search
       * @param string $replace
       * @param string $text
       * @return string
       */
      public static function replaceMarkup( $search, $replace, $text ) {
          $placeholder = "\x00";
          $cleaned = self::maskInsideMarkup( $search, $text, $placeholder );
          // Replace the remaining (unmasked) occurrences, then restore the masked ones
          $cleaned = str_replace( $search, $replace, $cleaned );
          return str_replace( $placeholder, $search, $cleaned );
      }

      /**
       * Replace every occurrence of $needle inside `<...>` with $placeholder.
       *
       * Existing occurrences of the placeholder are stripped from $text first,
       * so the placeholder marks masked needles only.
       *
       * @param string $needle String to hide inside HTML-like tags
       * @param string $text
       * @param string $placeholder
       * @return string
       */
      private static function maskInsideMarkup( $needle, $text, $placeholder ) {
          // Remove placeholder instances
          $text = str_replace( $placeholder, '', $text );
          // Replace instances of the needle inside HTML-like tags with the placeholder
          return self::delimiterReplaceCallback(
              '<', '>',
              function ( array $matches ) use ( $needle, $placeholder ) {
                  return str_replace( $needle, $placeholder, $matches[0] );
              },
              $text
          );
      }

      /**
       * Utility function to check if the given string is a valid PCRE regex. Avoids
       * manually calling suppressWarnings and restoreWarnings, and provides a
       * one-line solution without the need to use @.
       *
       * @since 1.34
       * @param string $string The string you want to check being a valid regex
       * @return bool
       */
      public static function isValidPCRERegex( $string ) {
          AtEase::suppressWarnings();
          try {
              // @phan-suppress-next-line PhanParamSuspiciousOrder False positive
              $isValid = preg_match( $string, '' );
          } finally {
              // Keep suppress/restore balanced even if preg_match() throws
              AtEase::restoreWarnings();
          }
          return $isValid !== false;
      }

      /**
       * Escape a string to make it suitable for inclusion in a preg_replace()
       * replacement parameter.
       *
       * Backslashes are doubled and dollar signs get a leading backslash.
       *
       * @param string $string
       * @return string
       */
      public static function escapeRegexReplacement( $string ) {
          // Single pass, so the backslash added in front of "$" is not doubled
          return strtr( $string, [ '\\' => '\\\\', '$' => '\\$' ] );
      }

      /**
       * Workalike for explode() with limited memory usage.
       *
       * Returns an ExplodeIterator when the separator occurs more than 1000
       * times in the subject, otherwise an ArrayIterator over the result of
       * explode().
       *
       * @param string $separator
       * @param string $subject
       * @return ArrayIterator|ExplodeIterator
       */
      public static function explode( $separator, $subject ) {
          if ( substr_count( $subject, $separator ) > 1000 ) {
              return new ExplodeIterator( $separator, $subject );
          }
          return new ArrayIterator( explode( $separator, $subject ) );
      }
  }