MSCompoundFileReader.php 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361
  1. <?php
  2. /*
  3. * Copyright 2019 Wikimedia Foundation
  4. *
  5. * Licensed under the Apache License, Version 2.0 (the "License"); you may
  6. * not use this file except in compliance with the License.
  7. * You may obtain a copy of the License at
  8. *
  9. * http://www.apache.org/licenses/LICENSE-2.0
  10. *
  11. * Unless required by applicable law or agreed to in writing, software distributed
  12. * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
  13. * OF ANY KIND, either express or implied. See the License for the
  14. * specific language governing permissions and limitations under the License.
  15. */
  16. /**
  17. * Read the directory of a Microsoft Compound File Binary file, a.k.a. an OLE
  18. * file, and detect the MIME type.
  19. *
  20. * References:
  21. * - MS-CFB https://msdn.microsoft.com/en-us/library/dd942138.aspx
  22. * - MS-XLS https://msdn.microsoft.com/en-us/library/cc313154.aspx
  23. * - MS-PPT https://msdn.microsoft.com/en-us/library/cc313106.aspx
  24. * - MS-DOC https://msdn.microsoft.com/en-us/library/cc313153.aspx
  25. * - Python olefile https://github.com/decalage2/olefile
  26. * - OpenOffice.org's Documentation of the Microsoft Compound Document
  27. * File Format https://www.openoffice.org/sc/compdocfileformat.pdf
  28. *
  29. * @since 1.33
  30. */
  31. class MSCompoundFileReader {
  32. private $file;
  33. private $header;
  34. private $mime;
  35. private $mimeFromClsid;
  36. private $error;
  37. private $errorCode;
  38. private $valid = false;
  39. private $sectorLength;
  40. private $difat;
  41. private $fat = [];
  42. private $fileLength;
  43. const TYPE_UNALLOCATED = 0;
  44. const TYPE_STORAGE = 1;
  45. const TYPE_STREAM = 2;
  46. const TYPE_ROOT = 5;
  47. const ERROR_FILE_OPEN = 1;
  48. const ERROR_SEEK = 2;
  49. const ERROR_READ = 3;
  50. const ERROR_INVALID_SIGNATURE = 4;
  51. const ERROR_READ_PAST_END = 5;
  52. const ERROR_INVALID_FORMAT = 6;
  53. private static $mimesByClsid = [
  54. // From http://justsolve.archiveteam.org/wiki/Microsoft_Compound_File
  55. '00020810-0000-0000-C000-000000000046' => 'application/vnd.ms-excel',
  56. '00020820-0000-0000-C000-000000000046' => 'application/vnd.ms-excel',
  57. '00020906-0000-0000-C000-000000000046' => 'application/msword',
  58. '64818D10-4F9B-11CF-86EA-00AA00B929E8' => 'application/vnd.ms-powerpoint',
  59. ];
  60. /**
  61. * Read a file by name
  62. *
  63. * @param string $fileName The full path to the file
  64. * @return array An associative array of information about the file:
  65. * - valid: true if the file is valid, false otherwise
  66. * - error: An error message in English, should be present if valid=false
  67. * - errorCode: One of the self::ERROR_* constants
  68. * - mime: The MIME type detected from the directory contents
  69. * - mimeFromClsid: The MIME type detected from the CLSID on the root
  70. * directory entry
  71. */
  72. public static function readFile( $fileName ) {
  73. $handle = fopen( $fileName, 'r' );
  74. if ( $handle === false ) {
  75. return [
  76. 'valid' => false,
  77. 'error' => 'file does not exist',
  78. 'errorCode' => self::ERROR_FILE_OPEN
  79. ];
  80. }
  81. return self::readHandle( $handle );
  82. }
  83. /**
  84. * Read from an open seekable handle
  85. *
  86. * @param resource $fileHandle The file handle
  87. * @return array An associative array of information about the file:
  88. * - valid: true if the file is valid, false otherwise
  89. * - error: An error message in English, should be present if valid=false
  90. * - errorCode: One of the self::ERROR_* constants
  91. * - mime: The MIME type detected from the directory contents
  92. * - mimeFromClsid: The MIME type detected from the CLSID on the root
  93. * directory entry
  94. */
  95. public static function readHandle( $fileHandle ) {
  96. $reader = new self( $fileHandle );
  97. $info = [
  98. 'valid' => $reader->valid,
  99. 'mime' => $reader->mime,
  100. 'mimeFromClsid' => $reader->mimeFromClsid
  101. ];
  102. if ( $reader->error ) {
  103. $info['error'] = $reader->error;
  104. $info['errorCode'] = $reader->errorCode;
  105. }
  106. return $info;
  107. }
  108. private function __construct( $fileHandle ) {
  109. $this->file = $fileHandle;
  110. try {
  111. $this->init();
  112. } catch ( RuntimeException $e ) {
  113. $this->valid = false;
  114. $this->error = $e->getMessage();
  115. $this->errorCode = $e->getCode();
  116. }
  117. }
  118. private function init() {
  119. $this->header = $this->unpackOffset( 0, [
  120. 'header_signature' => 8,
  121. 'header_clsid' => 16,
  122. 'minor_version' => 2,
  123. 'major_version' => 2,
  124. 'byte_order' => 2,
  125. 'sector_shift' => 2,
  126. 'mini_sector_shift' => 2,
  127. 'reserved' => 6,
  128. 'num_dir_sectors' => 4,
  129. 'num_fat_sectors' => 4,
  130. 'first_dir_sector' => 4,
  131. 'transaction_signature_number' => 4,
  132. 'mini_stream_cutoff_size' => 4,
  133. 'first_mini_fat_sector' => 4,
  134. 'num_mini_fat_sectors' => 4,
  135. 'first_difat_sector' => 4,
  136. 'num_difat_sectors' => 4,
  137. 'difat' => 436,
  138. ] );
  139. if ( $this->header['header_signature'] !== "\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1" ) {
  140. $this->error( 'invalid signature: ' . bin2hex( $this->header['header_signature'] ),
  141. self::ERROR_INVALID_SIGNATURE );
  142. }
  143. // @phan-suppress-next-line PhanTypeInvalidRightOperandOfIntegerOp
  144. $this->sectorLength = 1 << $this->header['sector_shift'];
  145. $this->readDifat();
  146. $this->readDirectory();
  147. $this->valid = true;
  148. }
  149. private function sectorOffset( $sectorId ) {
  150. return $this->sectorLength * ( $sectorId + 1 );
  151. }
  152. private function decodeClsid( $binaryClsid ) {
  153. $parts = unpack( 'Va/vb/vc/C8d', $binaryClsid );
  154. return sprintf( "%08X-%04X-%04X-%02X%02X-%02X%02X%02X%02X%02X%02X",
  155. $parts['a'],
  156. $parts['b'],
  157. $parts['c'],
  158. $parts['d1'],
  159. $parts['d2'],
  160. $parts['d3'],
  161. $parts['d4'],
  162. $parts['d5'],
  163. $parts['d6'],
  164. $parts['d7'],
  165. $parts['d8']
  166. );
  167. }
  168. /**
  169. * @param int $offset
  170. * @param int[] $struct
  171. * @return array
  172. */
  173. private function unpackOffset( $offset, $struct ) {
  174. $block = $this->readOffset( $offset, array_sum( $struct ) );
  175. return $this->unpack( $block, 0, $struct );
  176. }
  177. /**
  178. * @param string $block
  179. * @param int $offset
  180. * @param int[] $struct
  181. * @return array
  182. */
  183. private function unpack( $block, $offset, $struct ) {
  184. $data = [];
  185. foreach ( $struct as $key => $length ) {
  186. if ( $length > 4 ) {
  187. $data[$key] = substr( $block, $offset, $length );
  188. } else {
  189. $data[$key] = $this->bin2dec( $block, $offset, $length );
  190. }
  191. $offset += $length;
  192. }
  193. return $data;
  194. }
  195. private function bin2dec( $str, $offset, $length ) {
  196. $value = 0;
  197. for ( $i = $length - 1; $i >= 0; $i-- ) {
  198. $value *= 256;
  199. $value += ord( $str[$offset + $i] );
  200. }
  201. return $value;
  202. }
  203. private function readOffset( $offset, $length ) {
  204. $this->fseek( $offset );
  205. Wikimedia\suppressWarnings();
  206. $block = fread( $this->file, $length );
  207. Wikimedia\restoreWarnings();
  208. if ( $block === false ) {
  209. $this->error( 'error reading from file', self::ERROR_READ );
  210. }
  211. if ( strlen( $block ) !== $length ) {
  212. $this->error( 'unable to read the required number of bytes from the file',
  213. self::ERROR_READ_PAST_END );
  214. }
  215. return $block;
  216. }
  217. private function readSector( $sectorId ) {
  218. // @phan-suppress-next-line PhanTypeInvalidRightOperandOfIntegerOp
  219. return $this->readOffset( $this->sectorOffset( $sectorId ), 1 << $this->header['sector_shift'] );
  220. }
  221. private function error( $message, $code ) {
  222. throw new RuntimeException( $message, $code );
  223. }
  224. private function fseek( $offset ) {
  225. Wikimedia\suppressWarnings();
  226. $result = fseek( $this->file, $offset );
  227. Wikimedia\restoreWarnings();
  228. if ( $result !== 0 ) {
  229. $this->error( "unable to seek to offset $offset", self::ERROR_SEEK );
  230. }
  231. }
  232. private function readDifat() {
  233. $binaryDifat = $this->header['difat'];
  234. $nextDifatSector = $this->header['first_difat_sector'];
  235. for ( $i = 0; $i < $this->header['num_difat_sectors']; $i++ ) {
  236. $block = $this->readSector( $nextDifatSector );
  237. $binaryDifat .= substr( $block, 0, $this->sectorLength - 4 );
  238. $nextDifatSector = $this->bin2dec( $block, $this->sectorLength - 4, 4 );
  239. if ( $nextDifatSector == 0xFFFFFFFE ) {
  240. break;
  241. }
  242. }
  243. $this->difat = [];
  244. for ( $pos = 0; $pos < strlen( $binaryDifat ); $pos += 4 ) {
  245. $fatSector = $this->bin2dec( $binaryDifat, $pos, 4 );
  246. if ( $fatSector < 0xFFFFFFFC ) {
  247. $this->difat[] = $fatSector;
  248. } else {
  249. break;
  250. }
  251. }
  252. }
  253. private function getNextSectorIdFromFat( $sectorId ) {
  254. $entriesPerSector = intdiv( $this->sectorLength, 4 );
  255. $fatSectorId = intdiv( $sectorId, $entriesPerSector );
  256. $fatSectorArray = $this->getFatSector( $fatSectorId );
  257. return $fatSectorArray[$sectorId % $entriesPerSector];
  258. }
  259. private function getFatSector( $fatSectorId ) {
  260. if ( !isset( $this->fat[$fatSectorId] ) ) {
  261. $fat = [];
  262. if ( !isset( $this->difat[$fatSectorId] ) ) {
  263. $this->error( 'FAT sector requested beyond the end of the DIFAT', self::ERROR_INVALID_FORMAT );
  264. }
  265. $absoluteSectorId = $this->difat[$fatSectorId];
  266. $block = $this->readSector( $absoluteSectorId );
  267. for ( $pos = 0; $pos < strlen( $block ); $pos += 4 ) {
  268. $fat[] = $this->bin2dec( $block, $pos, 4 );
  269. }
  270. $this->fat[$fatSectorId] = $fat;
  271. }
  272. return $this->fat[$fatSectorId];
  273. }
  274. private function readDirectory() {
  275. $dirSectorId = $this->header['first_dir_sector'];
  276. $binaryDir = '';
  277. $seenSectorIds = [];
  278. while ( $dirSectorId !== 0xFFFFFFFE ) {
  279. if ( isset( $seenSectorIds[$dirSectorId] ) ) {
  280. $this->error( 'FAT loop detected', self::ERROR_INVALID_FORMAT );
  281. }
  282. $seenSectorIds[$dirSectorId] = true;
  283. $binaryDir .= $this->readSector( $dirSectorId );
  284. $dirSectorId = $this->getNextSectorIdFromFat( $dirSectorId );
  285. }
  286. $struct = [
  287. 'name_raw' => 64,
  288. 'name_length' => 2,
  289. 'object_type' => 1,
  290. 'color' => 1,
  291. 'sid_left' => 4,
  292. 'sid_right' => 4,
  293. 'sid_child' => 4,
  294. 'clsid' => 16,
  295. 'state_bits' => 4,
  296. 'create_time_low' => 4,
  297. 'create_time_high' => 4,
  298. 'modify_time_low' => 4,
  299. 'modify_time_high' => 4,
  300. 'first_sector' => 4,
  301. 'size_low' => 4,
  302. 'size_high' => 4,
  303. ];
  304. $entryLength = array_sum( $struct );
  305. for ( $pos = 0; $pos < strlen( $binaryDir ); $pos += $entryLength ) {
  306. $entry = $this->unpack( $binaryDir, $pos, $struct );
  307. // According to [MS-CFB] size_high may contain garbage due to a
  308. // bug in a writer, it's best to pretend it is zero
  309. $entry['size_high'] = 0;
  310. $type = $entry['object_type'];
  311. if ( $type == self::TYPE_UNALLOCATED ) {
  312. continue;
  313. }
  314. $name = iconv( 'UTF-16LE', 'UTF-8', substr( $entry['name_raw'], 0, $entry['name_length'] - 2 ) );
  315. $clsid = $this->decodeClsid( $entry['clsid'] );
  316. if ( $type == self::TYPE_ROOT && isset( self::$mimesByClsid[$clsid] ) ) {
  317. $this->mimeFromClsid = self::$mimesByClsid[$clsid];
  318. }
  319. if ( $name === 'Workbook' ) {
  320. $this->mime = 'application/vnd.ms-excel';
  321. } elseif ( $name === 'WordDocument' ) {
  322. $this->mime = 'application/msword';
  323. } elseif ( $name === 'PowerPoint Document' ) {
  324. $this->mime = 'application/vnd.ms-powerpoint';
  325. }
  326. }
  327. }
  328. }