DiffHistoryBlob.php 9.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377
  1. <?php
  2. /**
  3. * Efficient concatenated text storage.
  4. *
  5. * This program is free software; you can redistribute it and/or modify
  6. * it under the terms of the GNU General Public License as published by
  7. * the Free Software Foundation; either version 2 of the License, or
  8. * (at your option) any later version.
  9. *
  10. * This program is distributed in the hope that it will be useful,
  11. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13. * GNU General Public License for more details.
  14. *
  15. * You should have received a copy of the GNU General Public License along
  16. * with this program; if not, write to the Free Software Foundation, Inc.,
  17. * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  18. * http://www.gnu.org/copyleft/gpl.html
  19. *
  20. * @file
  21. */
  22. /**
  23. * Diff-based history compression
  24. * Requires xdiff 1.5+ and zlib
  25. */
  26. class DiffHistoryBlob implements HistoryBlob {
  27. /** @var array Uncompressed item cache */
  28. public $mItems = [];
  29. /** @var int Total uncompressed size */
  30. public $mSize = 0;
  31. /**
  32. * @var array Array of diffs. If a diff D from A to B is notated D = B - A,
  33. * and Z is an empty string:
  34. *
  35. * { item[map[i]] - item[map[i-1]] where i > 0
  36. * diff[i] = {
  37. * { item[map[i]] - Z where i = 0
  38. */
  39. public $mDiffs;
  40. /** @var array The diff map, see above */
  41. public $mDiffMap;
  42. /** @var int The key for getText()
  43. */
  44. public $mDefaultKey;
  45. /** @var string Compressed storage */
  46. public $mCompressed;
  47. /** @var bool True if the object is locked against further writes */
  48. public $mFrozen = false;
  49. /**
  50. * @var int The maximum uncompressed size before the object becomes sad
  51. * Should be less than max_allowed_packet
  52. */
  53. public $mMaxSize = 10000000;
  54. /** @var int The maximum number of text items before the object becomes sad */
  55. public $mMaxCount = 100;
  56. /** Constants from xdiff.h */
  57. const XDL_BDOP_INS = 1;
  58. const XDL_BDOP_CPY = 2;
  59. const XDL_BDOP_INSB = 3;
  60. function __construct() {
  61. if ( !function_exists( 'gzdeflate' ) ) {
  62. throw new MWException( "Need zlib support to read or write DiffHistoryBlob\n" );
  63. }
  64. }
  65. /**
  66. * @throws MWException
  67. * @param string $text
  68. * @return int
  69. */
  70. function addItem( $text ) {
  71. if ( $this->mFrozen ) {
  72. throw new MWException( __METHOD__ . ": Cannot add more items after sleep/wakeup" );
  73. }
  74. $this->mItems[] = $text;
  75. $this->mSize += strlen( $text );
  76. $this->mDiffs = null; // later
  77. return count( $this->mItems ) - 1;
  78. }
  79. /**
  80. * @param string $key
  81. * @return string
  82. */
  83. function getItem( $key ) {
  84. return $this->mItems[$key];
  85. }
  86. /**
  87. * @param string $text
  88. */
  89. function setText( $text ) {
  90. $this->mDefaultKey = $this->addItem( $text );
  91. }
  92. /**
  93. * @return string
  94. */
  95. function getText() {
  96. return $this->getItem( $this->mDefaultKey );
  97. }
  98. /**
  99. * @throws MWException
  100. */
  101. function compress() {
  102. if ( !function_exists( 'xdiff_string_rabdiff' ) ) {
  103. throw new MWException( "Need xdiff 1.5+ support to write DiffHistoryBlob\n" );
  104. }
  105. if ( isset( $this->mDiffs ) ) {
  106. // Already compressed
  107. return;
  108. }
  109. if ( $this->mItems === [] ) {
  110. return;
  111. }
  112. // Create two diff sequences: one for main text and one for small text
  113. $sequences = [
  114. 'small' => [
  115. 'tail' => '',
  116. 'diffs' => [],
  117. 'map' => [],
  118. ],
  119. 'main' => [
  120. 'tail' => '',
  121. 'diffs' => [],
  122. 'map' => [],
  123. ],
  124. ];
  125. $smallFactor = 0.5;
  126. $mItemsCount = count( $this->mItems );
  127. for ( $i = 0; $i < $mItemsCount; $i++ ) {
  128. $text = $this->mItems[$i];
  129. if ( $i == 0 ) {
  130. $seqName = 'main';
  131. } else {
  132. $mainTail = $sequences['main']['tail'];
  133. if ( strlen( $text ) < strlen( $mainTail ) * $smallFactor ) {
  134. $seqName = 'small';
  135. } else {
  136. $seqName = 'main';
  137. }
  138. }
  139. $tail = $sequences[$seqName]['tail'];
  140. $diff = $this->diff( $tail, $text );
  141. $sequences[$seqName]['diffs'][] = $diff;
  142. $sequences[$seqName]['map'][] = $i;
  143. $sequences[$seqName]['tail'] = $text;
  144. }
  145. // Knit the sequences together
  146. $tail = '';
  147. $this->mDiffs = [];
  148. $this->mDiffMap = [];
  149. foreach ( $sequences as $seq ) {
  150. if ( $seq['diffs'] === [] ) {
  151. continue;
  152. }
  153. if ( $tail === '' ) {
  154. $this->mDiffs[] = $seq['diffs'][0];
  155. } else {
  156. $head = $this->patch( '', $seq['diffs'][0] );
  157. $this->mDiffs[] = $this->diff( $tail, $head );
  158. }
  159. $this->mDiffMap[] = $seq['map'][0];
  160. $diffsCount = count( $seq['diffs'] );
  161. for ( $i = 1; $i < $diffsCount; $i++ ) {
  162. $this->mDiffs[] = $seq['diffs'][$i];
  163. $this->mDiffMap[] = $seq['map'][$i];
  164. }
  165. $tail = $seq['tail'];
  166. }
  167. }
  168. /**
  169. * @param string $t1
  170. * @param string $t2
  171. * @return string
  172. */
  173. function diff( $t1, $t2 ) {
  174. # Need to do a null concatenation with warnings off, due to bugs in the current version of xdiff
  175. # "String is not zero-terminated"
  176. Wikimedia\suppressWarnings();
  177. $diff = xdiff_string_rabdiff( $t1, $t2 ) . '';
  178. Wikimedia\restoreWarnings();
  179. return $diff;
  180. }
  181. /**
  182. * @param string $base
  183. * @param string $diff
  184. * @return bool|string
  185. */
  186. function patch( $base, $diff ) {
  187. if ( function_exists( 'xdiff_string_bpatch' ) ) {
  188. Wikimedia\suppressWarnings();
  189. $text = xdiff_string_bpatch( $base, $diff ) . '';
  190. Wikimedia\restoreWarnings();
  191. return $text;
  192. }
  193. # Pure PHP implementation
  194. $header = unpack( 'Vofp/Vcsize', substr( $diff, 0, 8 ) );
  195. # Check the checksum if hash extension is available
  196. $ofp = $this->xdiffAdler32( $base );
  197. if ( $ofp !== false && $ofp !== substr( $diff, 0, 4 ) ) {
  198. wfDebug( __METHOD__ . ": incorrect base checksum\n" );
  199. return false;
  200. }
  201. if ( $header['csize'] != strlen( $base ) ) {
  202. wfDebug( __METHOD__ . ": incorrect base length\n" );
  203. return false;
  204. }
  205. $p = 8;
  206. $out = '';
  207. while ( $p < strlen( $diff ) ) {
  208. $x = unpack( 'Cop', substr( $diff, $p, 1 ) );
  209. $op = $x['op'];
  210. ++$p;
  211. switch ( $op ) {
  212. case self::XDL_BDOP_INS:
  213. $x = unpack( 'Csize', substr( $diff, $p, 1 ) );
  214. $p++;
  215. $out .= substr( $diff, $p, $x['size'] );
  216. $p += $x['size'];
  217. break;
  218. case self::XDL_BDOP_INSB:
  219. $x = unpack( 'Vcsize', substr( $diff, $p, 4 ) );
  220. $p += 4;
  221. $out .= substr( $diff, $p, $x['csize'] );
  222. $p += $x['csize'];
  223. break;
  224. case self::XDL_BDOP_CPY:
  225. $x = unpack( 'Voff/Vcsize', substr( $diff, $p, 8 ) );
  226. $p += 8;
  227. $out .= substr( $base, $x['off'], $x['csize'] );
  228. break;
  229. default:
  230. wfDebug( __METHOD__ . ": invalid op\n" );
  231. return false;
  232. }
  233. }
  234. return $out;
  235. }
  236. /**
  237. * Compute a binary "Adler-32" checksum as defined by LibXDiff, i.e. with
  238. * the bytes backwards and initialised with 0 instead of 1. See T36428.
  239. *
  240. * @param string $s
  241. * @return string|bool False if the hash extension is not available
  242. */
  243. function xdiffAdler32( $s ) {
  244. if ( !function_exists( 'hash' ) ) {
  245. return false;
  246. }
  247. static $init;
  248. if ( $init === null ) {
  249. $init = str_repeat( "\xf0", 205 ) . "\xee" . str_repeat( "\xf0", 67 ) . "\x02";
  250. }
  251. // The real Adler-32 checksum of $init is zero, so it initialises the
  252. // state to zero, as it is at the start of LibXDiff's checksum
  253. // algorithm. Appending the subject string then simulates LibXDiff.
  254. return strrev( hash( 'adler32', $init . $s, true ) );
  255. }
  256. function uncompress() {
  257. if ( !$this->mDiffs ) {
  258. return;
  259. }
  260. $tail = '';
  261. $mDiffsCount = count( $this->mDiffs );
  262. for ( $diffKey = 0; $diffKey < $mDiffsCount; $diffKey++ ) {
  263. $textKey = $this->mDiffMap[$diffKey];
  264. $text = $this->patch( $tail, $this->mDiffs[$diffKey] );
  265. $this->mItems[$textKey] = $text;
  266. $tail = $text;
  267. }
  268. }
  269. /**
  270. * @return array
  271. */
  272. function __sleep() {
  273. $this->compress();
  274. if ( $this->mItems === [] ) {
  275. $info = false;
  276. } else {
  277. // Take forward differences to improve the compression ratio for sequences
  278. $map = '';
  279. $prev = 0;
  280. foreach ( $this->mDiffMap as $i ) {
  281. if ( $map !== '' ) {
  282. $map .= ',';
  283. }
  284. $map .= $i - $prev;
  285. $prev = $i;
  286. }
  287. $info = [
  288. 'diffs' => $this->mDiffs,
  289. 'map' => $map
  290. ];
  291. }
  292. if ( isset( $this->mDefaultKey ) ) {
  293. $info['default'] = $this->mDefaultKey;
  294. }
  295. $this->mCompressed = gzdeflate( serialize( $info ) );
  296. return [ 'mCompressed' ];
  297. }
  298. function __wakeup() {
  299. // addItem() doesn't work if mItems is partially filled from mDiffs
  300. $this->mFrozen = true;
  301. $info = unserialize( gzinflate( $this->mCompressed ) );
  302. $this->mCompressed = null;
  303. if ( !$info ) {
  304. // Empty object
  305. return;
  306. }
  307. if ( isset( $info['default'] ) ) {
  308. $this->mDefaultKey = $info['default'];
  309. }
  310. $this->mDiffs = $info['diffs'];
  311. if ( isset( $info['base'] ) ) {
  312. // Old format
  313. $this->mDiffMap = range( 0, count( $this->mDiffs ) - 1 );
  314. array_unshift( $this->mDiffs,
  315. pack( 'VVCV', 0, 0, self::XDL_BDOP_INSB, strlen( $info['base'] ) ) .
  316. $info['base'] );
  317. } else {
  318. // New format
  319. $map = explode( ',', $info['map'] );
  320. $cur = 0;
  321. $this->mDiffMap = [];
  322. foreach ( $map as $i ) {
  323. $cur += $i;
  324. $this->mDiffMap[] = $cur;
  325. }
  326. }
  327. $this->uncompress();
  328. }
  329. /**
  330. * Helper function for compression jobs
  331. * Returns true until the object is "full" and ready to be committed
  332. *
  333. * @return bool
  334. */
  335. function isHappy() {
  336. return $this->mSize < $this->mMaxSize
  337. && count( $this->mItems ) < $this->mMaxCount;
  338. }
  339. }