<?php
/**
 * Simulation of Microsoft Internet Explorer's MIME type detection algorithm.
 *
 * @file
 * @todo Define the exact license of this file.
 */
  /**
   * This class simulates Microsoft Internet Explorer's terribly broken and
   * insecure MIME type detection algorithm. It can be used to check web uploads
   * with an apparently safe type, to see if IE will reinterpret them to produce
   * something dangerous.
   *
   * It is full of bugs and strange design choices and should not under any
   * circumstances be used to determine a MIME type to present to a user or
   * client. (Apple Safari developers, this means you too.)
   *
   * This class is based on a disassembly of IE 5.0, 6.0 and 7.0. Although I have
   * attempted to ensure that this code works in exactly the same way as Internet
   * Explorer, it does not share any source code, or creative choices such as
   * variable names, thus I (Tim Starling) claim copyright on it.
   *
   * It may be redistributed without restriction. To aid reuse, this class does
   * not depend on any MediaWiki module.
   *
   * Typical use:
   *
   *     $analyzer = new IEContentAnalyzer();
   *     $types = $analyzer->getRealMimesFromData( $fileName, $firstBytes, $proposedType );
   *     // $types is a map such as [ 'ie05' => 'text/html', 'ie06' => ..., ... ]
   */
  class IEContentAnalyzer {
      /**
       * Relevant data taken from the type table in IE 5.
       *
       * Maps a "data format" name to the MIME types that have that format. The
       * numbers in the inline comments are the format codes noted alongside the
       * original table.
       */
      protected $baseTypeTable = [
          'ambiguous' /*1*/ => [
              'text/plain',
              'application/octet-stream',
              'application/x-netcdf', // [sic]
          ],
          'text' /*3*/ => [
              'text/richtext', 'image/x-bitmap', 'application/postscript', 'application/base64',
              'application/macbinhex40', 'application/x-cdf', 'text/scriptlet'
          ],
          'binary' /*4*/ => [
              'application/pdf', 'audio/x-aiff', 'audio/basic', 'audio/wav', 'image/gif',
              'image/pjpeg', 'image/jpeg', 'image/tiff', 'image/x-png', 'image/png', 'image/bmp',
              'image/x-jg', 'image/x-art', 'image/x-emf', 'image/x-wmf', 'video/avi',
              'video/x-msvideo', 'video/mpeg', 'application/x-compressed',
              'application/x-zip-compressed', 'application/x-gzip-compressed', 'application/java',
              'application/x-msdownload'
          ],
          'html' /*5*/ => [ 'text/html' ],
      ];

      /**
       * Changes to the type table in later versions of IE.
       *
       * Keyed by the first version that has the change, then by data format.
       * The constructor applies each entry to that version and every later one.
       */
      protected $addedTypes = [
          'ie07' => [
              'text' => [ 'text/xml', 'application/xml' ]
          ],
      ];

      /**
       * An approximation of the "Content Type" values in HKEY_CLASSES_ROOT in a
       * typical Windows installation.
       *
       * Used for extension to MIME type mapping if detection fails.
       *
       * Keys are file extensions including the leading dot, in lower case.
       * Registry key names are not case sensitive, so lookups lower-case the
       * extension first (see getMimeTypeForVersion()).
       */
      protected $registry = [
          '.323' => 'text/h323',
          '.3g2' => 'video/3gpp2',
          '.3gp' => 'video/3gpp',
          '.3gp2' => 'video/3gpp2',
          '.3gpp' => 'video/3gpp',
          '.aac' => 'audio/aac',
          '.ac3' => 'audio/ac3',
          '.accda' => 'application/msaccess',
          '.accdb' => 'application/msaccess',
          '.accdc' => 'application/msaccess',
          '.accde' => 'application/msaccess',
          '.accdr' => 'application/msaccess',
          '.accdt' => 'application/msaccess',
          '.ade' => 'application/msaccess',
          '.adp' => 'application/msaccess',
          '.adts' => 'audio/aac',
          '.ai' => 'application/postscript',
          '.aif' => 'audio/aiff',
          '.aifc' => 'audio/aiff',
          '.aiff' => 'audio/aiff',
          '.amc' => 'application/x-mpeg',
          '.application' => 'application/x-ms-application',
          '.asf' => 'video/x-ms-asf',
          '.asx' => 'video/x-ms-asf',
          '.au' => 'audio/basic',
          '.avi' => 'video/avi',
          '.bmp' => 'image/bmp',
          '.caf' => 'audio/x-caf',
          '.cat' => 'application/vnd.ms-pki.seccat',
          '.cbo' => 'application/sha',
          '.cdda' => 'audio/aiff',
          '.cer' => 'application/x-x509-ca-cert',
          '.conf' => 'text/plain',
          '.crl' => 'application/pkix-crl',
          '.crt' => 'application/x-x509-ca-cert',
          '.css' => 'text/css',
          '.csv' => 'application/vnd.ms-excel',
          '.der' => 'application/x-x509-ca-cert',
          '.dib' => 'image/bmp',
          '.dif' => 'video/x-dv',
          '.dll' => 'application/x-msdownload',
          '.doc' => 'application/msword',
          '.docm' => 'application/vnd.ms-word.document.macroEnabled.12',
          '.docx' => 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
          '.dot' => 'application/msword',
          '.dotm' => 'application/vnd.ms-word.template.macroEnabled.12',
          '.dotx' => 'application/vnd.openxmlformats-officedocument.wordprocessingml.template',
          '.dv' => 'video/x-dv',
          '.dwfx' => 'model/vnd.dwfx+xps',
          '.edn' => 'application/vnd.adobe.edn',
          '.eml' => 'message/rfc822',
          '.eps' => 'application/postscript',
          '.etd' => 'application/x-ebx',
          '.exe' => 'application/x-msdownload',
          '.fdf' => 'application/vnd.fdf',
          '.fif' => 'application/fractals',
          '.gif' => 'image/gif',
          '.gsm' => 'audio/x-gsm',
          '.hqx' => 'application/mac-binhex40',
          '.hta' => 'application/hta',
          '.htc' => 'text/x-component',
          '.htm' => 'text/html',
          '.html' => 'text/html',
          '.htt' => 'text/webviewhtml',
          '.hxa' => 'application/xml',
          '.hxc' => 'application/xml',
          '.hxd' => 'application/octet-stream',
          '.hxe' => 'application/xml',
          '.hxf' => 'application/xml',
          '.hxh' => 'application/octet-stream',
          '.hxi' => 'application/octet-stream',
          '.hxk' => 'application/xml',
          '.hxq' => 'application/octet-stream',
          '.hxr' => 'application/octet-stream',
          '.hxs' => 'application/octet-stream',
          '.hxt' => 'application/xml',
          '.hxv' => 'application/xml',
          '.hxw' => 'application/octet-stream',
          '.ico' => 'image/x-icon',
          '.iii' => 'application/x-iphone',
          '.ins' => 'application/x-internet-signup',
          '.iqy' => 'text/x-ms-iqy',
          '.isp' => 'application/x-internet-signup',
          '.jfif' => 'image/jpeg',
          '.jnlp' => 'application/x-java-jnlp-file',
          '.jpe' => 'image/jpeg',
          '.jpeg' => 'image/jpeg',
          '.jpg' => 'image/jpeg',
          '.jtx' => 'application/x-jtx+xps',
          '.latex' => 'application/x-latex',
          '.log' => 'text/plain',
          '.m1v' => 'video/mpeg',
          '.m2v' => 'video/mpeg',
          '.m3u' => 'audio/x-mpegurl',
          '.mac' => 'image/x-macpaint',
          '.man' => 'application/x-troff-man',
          '.mda' => 'application/msaccess',
          '.mdb' => 'application/msaccess',
          '.mde' => 'application/msaccess',
          '.mfp' => 'application/x-shockwave-flash',
          '.mht' => 'message/rfc822',
          '.mhtml' => 'message/rfc822',
          '.mid' => 'audio/mid',
          '.midi' => 'audio/mid',
          '.mod' => 'video/mpeg',
          '.mov' => 'video/quicktime',
          '.mp2' => 'video/mpeg',
          '.mp2v' => 'video/mpeg',
          '.mp3' => 'audio/mpeg',
          '.mp4' => 'video/mp4',
          '.mpa' => 'video/mpeg',
          '.mpe' => 'video/mpeg',
          '.mpeg' => 'video/mpeg',
          '.mpf' => 'application/vnd.ms-mediapackage',
          '.mpg' => 'video/mpeg',
          '.mpv2' => 'video/mpeg',
          '.mqv' => 'video/quicktime',
          '.nmw' => 'application/nmwb',
          '.nws' => 'message/rfc822',
          '.odc' => 'text/x-ms-odc',
          '.ols' => 'application/vnd.ms-publisher',
          '.p10' => 'application/pkcs10',
          '.p12' => 'application/x-pkcs12',
          '.p7b' => 'application/x-pkcs7-certificates',
          '.p7c' => 'application/pkcs7-mime',
          '.p7m' => 'application/pkcs7-mime',
          '.p7r' => 'application/x-pkcs7-certreqresp',
          '.p7s' => 'application/pkcs7-signature',
          '.pct' => 'image/pict',
          '.pdf' => 'application/pdf',
          '.pdx' => 'application/vnd.adobe.pdx',
          '.pfx' => 'application/x-pkcs12',
          '.pic' => 'image/pict',
          '.pict' => 'image/pict',
          '.pinstall' => 'application/x-picasa-detect',
          '.pko' => 'application/vnd.ms-pki.pko',
          '.png' => 'image/png',
          '.pnt' => 'image/x-macpaint',
          '.pntg' => 'image/x-macpaint',
          '.pot' => 'application/vnd.ms-powerpoint',
          '.potm' => 'application/vnd.ms-powerpoint.template.macroEnabled.12',
          '.potx' => 'application/vnd.openxmlformats-officedocument.presentationml.template',
          '.ppa' => 'application/vnd.ms-powerpoint',
          '.ppam' => 'application/vnd.ms-powerpoint.addin.macroEnabled.12',
          '.pps' => 'application/vnd.ms-powerpoint',
          '.ppsm' => 'application/vnd.ms-powerpoint.slideshow.macroEnabled.12',
          '.ppsx' => 'application/vnd.openxmlformats-officedocument.presentationml.slideshow',
          '.ppt' => 'application/vnd.ms-powerpoint',
          '.pptm' => 'application/vnd.ms-powerpoint.presentation.macroEnabled.12',
          '.pptx' => 'application/vnd.openxmlformats-officedocument.presentationml.presentation',
          '.prf' => 'application/pics-rules',
          '.ps' => 'application/postscript',
          '.pub' => 'application/vnd.ms-publisher',
          '.pwz' => 'application/vnd.ms-powerpoint',
          '.py' => 'text/plain',
          '.pyw' => 'text/plain',
          '.qht' => 'text/x-html-insertion',
          '.qhtm' => 'text/x-html-insertion',
          '.qt' => 'video/quicktime',
          '.qti' => 'image/x-quicktime',
          '.qtif' => 'image/x-quicktime',
          '.qtl' => 'application/x-quicktimeplayer',
          '.rat' => 'application/rat-file',
          '.rmf' => 'application/vnd.adobe.rmf',
          '.rmi' => 'audio/mid',
          '.rqy' => 'text/x-ms-rqy',
          '.rtf' => 'application/msword',
          '.sct' => 'text/scriptlet',
          '.sd2' => 'audio/x-sd2',
          '.sdp' => 'application/sdp',
          '.shtml' => 'text/html',
          '.sit' => 'application/x-stuffit',
          '.sldm' => 'application/vnd.ms-powerpoint.slide.macroEnabled.12',
          '.sldx' => 'application/vnd.openxmlformats-officedocument.presentationml.slide',
          '.slk' => 'application/vnd.ms-excel',
          '.snd' => 'audio/basic',
          '.so' => 'application/x-apachemodule',
          '.sol' => 'text/plain',
          '.sor' => 'text/plain',
          '.spc' => 'application/x-pkcs7-certificates',
          '.spl' => 'application/futuresplash',
          '.sst' => 'application/vnd.ms-pki.certstore',
          '.stl' => 'application/vnd.ms-pki.stl',
          '.swf' => 'application/x-shockwave-flash',
          '.thmx' => 'application/vnd.ms-officetheme',
          '.tif' => 'image/tiff',
          '.tiff' => 'image/tiff',
          '.txt' => 'text/plain',
          '.uls' => 'text/iuls',
          '.vcf' => 'text/x-vcard',
          '.vdx' => 'application/vnd.ms-visio.viewer',
          '.vsd' => 'application/vnd.ms-visio.viewer',
          '.vss' => 'application/vnd.ms-visio.viewer',
          '.vst' => 'application/vnd.ms-visio.viewer',
          '.vsx' => 'application/vnd.ms-visio.viewer',
          '.vtx' => 'application/vnd.ms-visio.viewer',
          '.wav' => 'audio/wav',
          '.wax' => 'audio/x-ms-wax',
          '.wbk' => 'application/msword',
          '.wdp' => 'image/vnd.ms-photo',
          '.wiz' => 'application/msword',
          '.wm' => 'video/x-ms-wm',
          '.wma' => 'audio/x-ms-wma',
          '.wmd' => 'application/x-ms-wmd',
          '.wmv' => 'video/x-ms-wmv',
          '.wmx' => 'video/x-ms-wmx',
          '.wmz' => 'application/x-ms-wmz',
          '.wpl' => 'application/vnd.ms-wpl',
          '.wsc' => 'text/scriptlet',
          '.wvx' => 'video/x-ms-wvx',
          '.xaml' => 'application/xaml+xml',
          '.xbap' => 'application/x-ms-xbap',
          '.xdp' => 'application/vnd.adobe.xdp+xml',
          '.xfdf' => 'application/vnd.adobe.xfdf',
          '.xht' => 'application/xhtml+xml',
          '.xhtml' => 'application/xhtml+xml',
          '.xla' => 'application/vnd.ms-excel',
          '.xlam' => 'application/vnd.ms-excel.addin.macroEnabled.12',
          '.xlk' => 'application/vnd.ms-excel',
          '.xll' => 'application/vnd.ms-excel',
          '.xlm' => 'application/vnd.ms-excel',
          '.xls' => 'application/vnd.ms-excel',
          '.xlsb' => 'application/vnd.ms-excel.sheet.binary.macroEnabled.12',
          '.xlsm' => 'application/vnd.ms-excel.sheet.macroEnabled.12',
          '.xlsx' => 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
          '.xlt' => 'application/vnd.ms-excel',
          '.xltm' => 'application/vnd.ms-excel.template.macroEnabled.12',
          '.xltx' => 'application/vnd.openxmlformats-officedocument.spreadsheetml.template',
          '.xlw' => 'application/vnd.ms-excel',
          '.xml' => 'text/xml',
          '.xps' => 'application/vnd.ms-xpsdocument',
          '.xsl' => 'text/xml',
      ];

      /**
       * IE versions which have been analysed to bring you this class, and for
       * which some substantive difference exists. These will appear as keys
       * in the return value of getRealMimesFromData(). The names are chosen to
       * sort correctly: the code compares them as plain strings (for example
       * $version >= 'ie07') to decide which behaviour applies.
       */
      protected $versions = [ 'ie05', 'ie06', 'ie07', 'ie07.strict', 'ie07.nohtml' ];

      /**
       * Type table with versions expanded: version name => format => list of
       * MIME types. Filled in by the constructor.
       */
      protected $typeTable = [];

      /**
       * Build the per-version type tables and normalise the registry keys.
       */
      public function __construct() {
          // Registry key names are case-insensitive on Windows. Keep the table in a
          // single canonical case so that the lookup in getMimeTypeForVersion() can
          // simply lower-case the extension (this also covers tables supplied by
          // subclasses with mixed-case keys).
          $this->registry = array_change_key_case( $this->registry, CASE_LOWER );

          // Construct versioned type arrays from the base type array plus additions.
          // $types accumulates, so an addition made for one version is inherited by
          // every version listed after it.
          $types = $this->baseTypeTable;
          foreach ( $this->versions as $version ) {
              if ( isset( $this->addedTypes[$version] ) ) {
                  foreach ( $this->addedTypes[$version] as $format => $addedTypes ) {
                      $types[$format] = array_merge( $types[$format], $addedTypes );
                  }
              }
              $this->typeTable[$version] = $types;
          }
      }

      /**
       * Get the MIME types from getMimesFromData(), but convert the result from IE's
       * idiosyncratic private types into something other apps will understand.
       *
       * @param string $fileName the file name; only its extension is used, as a
       *   last-resort registry lookup when content detection is inconclusive
       * @param string $chunk the start of the file (only the first 255 bytes are examined)
       * @param string $proposed the MIME type proposed by the server
       *
       * @return array map of IE version to detected MIME type
       */
      public function getRealMimesFromData( $fileName, $chunk, $proposed ) {
          $types = $this->getMimesFromData( $fileName, $chunk, $proposed );
          $types = array_map( [ $this, 'translateMimeType' ], $types );
          return $types;
      }

      /**
       * Translate a MIME type from IE's idiosyncratic private types into
       * more commonly understood type strings. Types without a translation are
       * returned unchanged.
       *
       * @param string $type
       * @return string
       */
      public function translateMimeType( $type ) {
          static $table = [
              'image/pjpeg' => 'image/jpeg',
              'image/x-png' => 'image/png',
              'image/x-wmf' => 'application/x-msmetafile',
              'image/bmp' => 'image/x-bmp',
              'application/x-zip-compressed' => 'application/zip',
              'application/x-compressed' => 'application/x-compress',
              'application/x-gzip-compressed' => 'application/x-gzip',
              'audio/mid' => 'audio/midi',
          ];
          if ( isset( $table[$type] ) ) {
              $type = $table[$type];
          }
          return $type;
      }

      /**
       * Get the untranslated MIME types for all known versions.
       *
       * @param string $fileName the file name; only its extension is used, as a
       *   last-resort registry lookup when content detection is inconclusive
       * @param string $chunk the start of the file (only the first 255 bytes are examined)
       * @param string $proposed the MIME type proposed by the server
       *
       * @return array map of IE version to detected MIME type
       */
      public function getMimesFromData( $fileName, $chunk, $proposed ) {
          $types = [];
          foreach ( $this->versions as $version ) {
              $types[$version] = $this->getMimeTypeForVersion( $version, $fileName, $chunk, $proposed );
          }
          return $types;
      }

      /**
       * Get the MIME type for a given named version.
       *
       * The order of the checks below is significant: each one returns as soon
       * as it matches, mirroring the order in which IE applies them.
       *
       * @param string $version one of the names in $this->versions
       * @param string $fileName file name, used only for the extension fallback
       * @param string $chunk start of the file contents
       * @param string $proposed MIME type proposed by the server; anything from
       *   the first semicolon onwards is ignored
       * @return string the type this IE version would settle on (IE's own
       *   spelling, see translateMimeType())
       */
      protected function getMimeTypeForVersion( $version, $fileName, $chunk, $proposed ) {
          // Strip text after a semicolon
          $semiPos = strpos( $proposed, ';' );
          if ( $semiPos !== false ) {
              $proposed = substr( $proposed, 0, $semiPos );
          }

          // A proposed type that is not in the type table is trusted as-is, with
          // the exception of the two multipart types which are still sniffed.
          $proposedFormat = $this->getDataFormat( $version, $proposed );
          if ( $proposedFormat === 'unknown'
              && $proposed !== 'multipart/mixed'
              && $proposed !== 'multipart/x-mixed-replace'
          ) {
              return $proposed;
          }

          // Nothing to sniff
          if ( strval( $chunk ) === '' ) {
              return $proposed;
          }

          // Truncate chunk at 255 bytes
          $chunk = substr( $chunk, 0, 255 );

          // IE does the Check*Headers() calls last, and instead does the following image
          // type checks by directly looking for the magic numbers. What I do here should
          // have the same effect since the magic number checks are identical in both cases.
          $result = $this->sampleData( $version, $chunk );
          $sampleFound = $result['found'];
          $counters = $result['counters'];
          $binaryType = $this->checkBinaryHeaders( $version, $chunk );
          $textType = $this->checkTextHeaders( $version, $chunk );

          // Cases where the proposed type is confirmed by the content
          if ( $proposed === 'text/html' && isset( $sampleFound['html'] ) ) {
              return 'text/html';
          }
          if ( $proposed === 'image/gif' && $binaryType === 'image/gif' ) {
              return 'image/gif';
          }
          if ( ( $proposed === 'image/pjpeg' || $proposed === 'image/jpeg' )
              && $binaryType === 'image/pjpeg'
          ) {
              return $proposed;
          }
          // PNG check added in IE 7
          if ( $version >= 'ie07'
              && ( $proposed === 'image/x-png' || $proposed === 'image/png' )
              && $binaryType === 'image/x-png'
          ) {
              return $proposed;
          }

          // CDF was removed in IE 7 so it won't be in $sampleFound for later versions
          if ( isset( $sampleFound['cdf'] ) ) {
              return 'application/x-cdf';
          }

          // RSS and Atom were added in IE 7 so they won't be in $sampleFound for
          // previous versions
          if ( isset( $sampleFound['rss'] ) ) {
              return 'application/rss+xml';
          }
          // RSS 1.0: an <rdf:RDF> tag plus both namespace URLs
          if ( isset( $sampleFound['rdf-tag'] )
              && isset( $sampleFound['rdf-url'] )
              && isset( $sampleFound['rdf-purl'] )
          ) {
              return 'application/rss+xml';
          }
          if ( isset( $sampleFound['atom'] ) ) {
              return 'application/atom+xml';
          }

          if ( isset( $sampleFound['xml'] ) ) {
              // TODO: I'm not sure under what circumstances this flag is enabled
              if ( strpos( $version, 'strict' ) !== false ) {
                  if ( $proposed === 'text/html' || $proposed === 'text/xml' ) {
                      return 'text/xml';
                  }
              } else {
                  return 'text/xml';
              }
          }
          if ( isset( $sampleFound['html'] ) ) {
              // TODO: I'm not sure under what circumstances this flag is enabled
              if ( strpos( $version, 'nohtml' ) !== false ) {
                  if ( $proposed === 'text/plain' ) {
                      return 'text/html';
                  }
              } else {
                  return 'text/html';
              }
          }
          if ( isset( $sampleFound['xbm'] ) ) {
              return 'image/x-bitmap';
          }
          if ( isset( $sampleFound['binhex'] ) ) {
              return 'application/macbinhex40';
          }
          if ( isset( $sampleFound['scriptlet'] ) ) {
              if ( strpos( $version, 'strict' ) !== false ) {
                  if ( $proposed === 'text/plain' || $proposed === 'text/scriptlet' ) {
                      return 'text/scriptlet';
                  }
              } else {
                  return 'text/scriptlet';
              }
          }

          // Freaky heuristics to determine if the data is text or binary
          // The heuristic is of course broken for non-ASCII text
          if ( $counters['ctrl'] !== 0 && ( $counters['ff'] + $counters['low'] )
              < ( $counters['ctrl'] + $counters['high'] ) * 16
          ) {
              $kindOfBinary = true;
              // Prefer a binary magic number, fall back to a text one
              $type = $binaryType ?: $textType;
              if ( $type === false ) {
                  $type = 'application/octet-stream';
              }
          } else {
              $kindOfBinary = false;
              // Prefer a text magic number, fall back to a binary one
              $type = $textType ?: $binaryType;
              if ( $type === false ) {
                  $type = 'text/plain';
              }
          }

          // Check if the output format is ambiguous
          // This generally means that detection failed, real types aren't ambiguous
          $detectedFormat = $this->getDataFormat( $version, $type );
          if ( $detectedFormat !== 'ambiguous' ) {
              return $type;
          }

          if ( $proposedFormat !== 'ambiguous' ) {
              // FormatAgreesWithData()
              if ( $proposedFormat === 'text' && !$kindOfBinary ) {
                  return $proposed;
              }
              if ( $proposedFormat === 'binary' && $kindOfBinary ) {
                  return $proposed;
              }
              if ( $proposedFormat === 'html' ) {
                  return $proposed;
              }
          }

          // Find a MIME type by searching the registry for the file extension.
          $dotPos = strrpos( $fileName, '.' );
          if ( $dotPos === false ) {
              return $type;
          }
          // Registry key names are not case sensitive, so "FOO.HTM" must match the
          // ".htm" entry. The table keys are lower case.
          $ext = strtolower( substr( $fileName, $dotPos ) );
          if ( isset( $this->registry[$ext] ) ) {
              return $this->registry[$ext];
          }

          // TODO: If the extension has an application registered to it, IE will return
          // application/octet-stream. We'll skip that, so we could erroneously
          // return text/plain or application/x-netcdf where application/octet-stream
          // would be correct.
          return $type;
      }

      /**
       * Check for text headers at the start of the chunk.
       * Confirmed same in 5 and 7.
       *
       * @param string $version unused, the check is the same for all versions
       * @param string $chunk
       * @return bool|string the MIME type for the recognised header, or false
       */
      private function checkTextHeaders( $version, $chunk ) {
          $chunk2 = substr( $chunk, 0, 2 );
          $chunk4 = substr( $chunk, 0, 4 );
          $chunk5 = substr( $chunk, 0, 5 );
          if ( $chunk4 === '%PDF' ) {
              return 'application/pdf';
          }
          if ( $chunk2 === '%!' ) {
              return 'application/postscript';
          }
          if ( $chunk5 === '{\\rtf' ) {
              return 'text/richtext';
          }
          if ( $chunk5 === 'begin' ) {
              return 'application/base64';
          }
          return false;
      }

      /**
       * Check for binary headers at the start of the chunk.
       * Confirmed same in 5 and 7.
       *
       * The checks are tried in order and the first match wins, so their order
       * must not be changed.
       *
       * @param string $version unused, the check is the same for all versions
       * @param string $chunk
       * @return bool|string the MIME type for the recognised header, or false
       */
      private function checkBinaryHeaders( $version, $chunk ) {
          $chunk2 = substr( $chunk, 0, 2 );
          $chunk3 = substr( $chunk, 0, 3 );
          $chunk4 = substr( $chunk, 0, 4 );
          $chunk5 = substr( $chunk, 0, 5 );
          $chunk5uc = strtoupper( $chunk5 );
          $chunk8 = substr( $chunk, 0, 8 );
          if ( $chunk5uc === 'GIF87' || $chunk5uc === 'GIF89' ) {
              return 'image/gif';
          }
          if ( $chunk2 === "\xff\xd8" ) {
              return 'image/pjpeg'; // actually plain JPEG but this is what IE returns
          }

          if ( $chunk2 === 'BM'
              && substr( $chunk, 6, 2 ) === "\000\000"
              && substr( $chunk, 8, 2 ) === "\000\000"
          ) {
              return 'image/bmp'; // another non-standard MIME
          }
          if ( $chunk4 === 'RIFF'
              && substr( $chunk, 8, 4 ) === 'WAVE'
          ) {
              return 'audio/wav';
          }
          // These were integer literals in IE
          // Perhaps the author was not sure what the target endianness was
          if ( $chunk4 === ".sd\000"
              || $chunk4 === ".snd"
              || $chunk4 === "\000ds."
              || $chunk4 === "dns."
          ) {
              return 'audio/basic';
          }
          if ( $chunk3 === "MM\000" ) {
              return 'image/tiff';
          }
          if ( $chunk2 === 'MZ' ) {
              return 'application/x-msdownload';
          }
          if ( $chunk8 === "\x89PNG\x0d\x0a\x1a\x0a" ) {
              return 'image/x-png'; // [sic]
          }
          if ( strlen( $chunk ) >= 5 ) {
              $byte2 = ord( $chunk[2] );
              $byte4 = ord( $chunk[4] );
              if ( $byte2 >= 3 && $byte2 <= 31 && $byte4 === 0 && $chunk2 === 'JG' ) {
                  return 'image/x-jg';
              }
          }
          // More endian confusion?
          if ( $chunk4 === 'MROF' ) {
              return 'audio/x-aiff';
          }
          $chunk4_8 = substr( $chunk, 8, 4 );
          if ( $chunk4 === 'FORM' && ( $chunk4_8 === 'AIFF' || $chunk4_8 === 'AIFC' ) ) {
              return 'audio/x-aiff';
          }
          if ( $chunk4 === 'RIFF' && $chunk4_8 === 'AVI ' ) {
              return 'video/avi';
          }
          if ( $chunk4 === "\x00\x00\x01\xb3" || $chunk4 === "\x00\x00\x01\xba" ) {
              return 'video/mpeg';
          }
          if ( $chunk4 === "\001\000\000\000"
              && substr( $chunk, 40, 4 ) === ' EMF'
          ) {
              return 'image/x-emf';
          }
          if ( $chunk4 === "\xd7\xcd\xc6\x9a" ) {
              return 'image/x-wmf';
          }
          if ( $chunk4 === "\xca\xfe\xba\xbe" ) {
              return 'application/java';
          }
          if ( $chunk2 === 'PK' ) {
              return 'application/x-zip-compressed';
          }
          if ( $chunk2 === "\x1f\x9d" ) {
              return 'application/x-compressed';
          }
          if ( $chunk2 === "\x1f\x8b" ) {
              return 'application/x-gzip-compressed';
          }
          // Skip redundant check for ZIP
          if ( $chunk5 === "MThd\000" ) {
              return 'audio/mid';
          }
          if ( $chunk4 === '%PDF' ) {
              return 'application/pdf';
          }
          return false;
      }

      /**
       * Do heuristic checks on the bulk of the data sample.
       * Search for HTML tags.
       *
       * Walks the chunk one byte at a time, classifying each byte into a counter
       * and looking for marker strings that start at the current position.
       *
       * @param string $version
       * @param string $chunk
       * @return array with two keys: 'found', a set of marker names (as array
       *   keys) that were seen, and 'counters', the per-class byte counts
       */
      protected function sampleData( $version, $chunk ) {
          $found = [];
          $counters = [
              'ctrl' => 0,
              'high' => 0,
              'low' => 0,
              'lf' => 0,
              'cr' => 0,
              'ff' => 0
          ];
          $htmlTags = [
              'html',
              'head',
              'title',
              'body',
              'script',
              'a href',
              'pre',
              'img',
              'plaintext',
              'table'
          ];
          $rdfUrl = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#';
          $rdfPurl = 'http://purl.org/rss/1.0/';
          $xbmMagic1 = '#define';
          $xbmMagic2 = '_width';
          $xbmMagic3 = '_bits';
          $binhexMagic = 'converted with BinHex';
          $chunkLength = strlen( $chunk );

          for ( $offset = 0; $offset < $chunkLength; $offset++ ) {
              $curChar = $chunk[$offset];
              // Classify the byte. Everything except printable ASCII (and tab,
              // which is counted as 'low') is counted and then skipped.
              if ( $curChar === "\x0a" ) {
                  $counters['lf']++;
                  continue;
              } elseif ( $curChar === "\x0d" ) {
                  $counters['cr']++;
                  continue;
              } elseif ( $curChar === "\x0c" ) {
                  $counters['ff']++;
                  continue;
              } elseif ( $curChar === "\t" ) {
                  $counters['low']++;
                  continue;
              } elseif ( ord( $curChar ) < 32 ) {
                  $counters['ctrl']++;
                  continue;
              } elseif ( ord( $curChar ) >= 128 ) {
                  $counters['high']++;
                  continue;
              }

              $counters['low']++;
              if ( $curChar === '<' ) {
                  // Tag names are matched case-insensitively against the text
                  // that follows the '<'.
                  // XML
                  $remainder = substr( $chunk, $offset + 1 );
                  if ( !strncasecmp( $remainder, '?XML', 4 ) ) {
                      $nextChar = substr( $chunk, $offset + 5, 1 );
                      if ( $nextChar === ':' || $nextChar === ' ' || $nextChar === "\t" ) {
                          $found['xml'] = true;
                      }
                  }
                  // Scriptlet (JSP)
                  if ( !strncasecmp( $remainder, 'SCRIPTLET', 9 ) ) {
                      $found['scriptlet'] = true;
                      break;
                  }
                  // HTML
                  foreach ( $htmlTags as $tag ) {
                      if ( !strncasecmp( $remainder, $tag, strlen( $tag ) ) ) {
                          $found['html'] = true;
                      }
                  }
                  // Skip broken check for additional tags (HR etc.)

                  // CHANNEL replaced by RSS, RDF and FEED in IE 7
                  if ( $version < 'ie07' ) {
                      if ( !strncasecmp( $remainder, 'CHANNEL', 7 ) ) {
                          $found['cdf'] = true;
                      }
                  } else {
                      // RSS
                      if ( !strncasecmp( $remainder, 'RSS', 3 ) ) {
                          $found['rss'] = true;
                          break; // return from SampleData
                      }
                      if ( !strncasecmp( $remainder, 'rdf:RDF', 7 ) ) {
                          $found['rdf-tag'] = true;
                          // no break
                      }
                      if ( !strncasecmp( $remainder, 'FEED', 4 ) ) {
                          $found['atom'] = true;
                          break;
                      }
                  }
                  continue;
              }
              // Skip broken check for -->

              // RSS URL checks
              // For some reason both URLs must appear before it is recognised
              $remainder = substr( $chunk, $offset );
              if ( !strncasecmp( $remainder, $rdfUrl, strlen( $rdfUrl ) ) ) {
                  $found['rdf-url'] = true;
                  if ( isset( $found['rdf-tag'] )
                      && isset( $found['rdf-purl'] ) // [sic]
                  ) {
                      break;
                  }
                  continue;
              }

              if ( !strncasecmp( $remainder, $rdfPurl, strlen( $rdfPurl ) ) ) {
                  // Record the match. Without this the flag is never set and the
                  // RDF-based RSS detection in getMimeTypeForVersion() can never
                  // succeed.
                  $found['rdf-purl'] = true;
                  if ( isset( $found['rdf-tag'] )
                      && isset( $found['rdf-url'] ) // [sic]
                  ) {
                      break;
                  }
                  continue;
              }

              // XBM checks: "#define", then "_width", then "_bits", in that order
              if ( !strncasecmp( $remainder, $xbmMagic1, strlen( $xbmMagic1 ) ) ) {
                  $found['xbm1'] = true;
                  continue;
              }
              if ( $curChar === '_' ) {
                  if ( isset( $found['xbm2'] ) ) {
                      if ( !strncasecmp( $remainder, $xbmMagic3, strlen( $xbmMagic3 ) ) ) {
                          $found['xbm'] = true;
                          break;
                      }
                  } elseif ( isset( $found['xbm1'] ) ) {
                      if ( !strncasecmp( $remainder, $xbmMagic2, strlen( $xbmMagic2 ) ) ) {
                          $found['xbm2'] = true;
                      }
                  }
              }

              // BinHex (this one is case-sensitive)
              if ( !strncmp( $remainder, $binhexMagic, strlen( $binhexMagic ) ) ) {
                  $found['binhex'] = true;
              }
          }
          return [ 'found' => $found, 'counters' => $counters ];
      }

      /**
       * Look up the data format of a MIME type in the type table of a version.
       *
       * @param string $version one of the names in $this->versions
       * @param string|null $type
       * @return string a key of the type table ('ambiguous', 'text', 'binary'
       *   or 'html'), or 'unknown' if the type is not listed. An empty type and
       *   the literal string '(null)' count as 'ambiguous'.
       */
      protected function getDataFormat( $version, $type ) {
          $types = $this->typeTable[$version];
          if ( $type == '(null)' || strval( $type ) === '' ) {
              return 'ambiguous';
          }
          foreach ( $types as $format => $list ) {
              if ( in_array( $type, $list, true ) ) {
                  return $format;
              }
          }
          return 'unknown';
      }
  }