<?php
/*
 * StatusNet - the distributed open-source microblogging tool
 * Copyright (C) 2008, 2009, StatusNet, Inc.
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */
  // Stop immediately unless the GNUSOCIAL constant has already been defined
  // (presumably by the application bootstrap; this blocks direct access).
  if (!defined('GNUSOCIAL')) {
      exit(1);
  }
  /**
   * Table Definition for file_redirection
   *
   * Each row maps a redirecting URL (typically a shortened URL) to the id of
   * the File record it resolves to, together with the number of redirects
   * and the HTTP status code recorded for it.
   *
   * All helper methods are static: they are invoked as
   * File_redirection::method() and never touch instance state.
   */
  class File_redirection extends Managed_DataObject
  {
      ###START_AUTOCODE
      /* the code below is auto generated do not remove the above tag */

      public $__table = 'file_redirection';    // table name
      public $url;                             // varchar(255)  primary_key not_null
      public $file_id;                         // int(4)
      public $redirections;                    // int(4)
      public $httpcode;                        // int(4)
      public $modified;                        // timestamp()   not_null default_CURRENT_TIMESTAMP

      /* the code above is auto generated do not remove the tag below */
      ###END_AUTOCODE

      /**
       * Schema definition for the file_redirection table.
       *
       * @return array table definition with 'fields', 'primary key' and
       *               'foreign keys' entries
       */
      public static function schemaDef()
      {
          return array(
              'fields' => array(
                  'url' => array('type' => 'varchar', 'length' => 255, 'not null' => true, 'description' => 'short URL (or any other kind of redirect) for file (id)'),
                  'file_id' => array('type' => 'int', 'description' => 'short URL for what URL/file'),
                  'redirections' => array('type' => 'int', 'description' => 'redirect count'),
                  'httpcode' => array('type' => 'int', 'description' => 'HTTP status code (20x, 30x, etc.)'),
                  'modified' => array('type' => 'timestamp', 'not null' => true, 'description' => 'date this record was modified'),
              ),
              'primary key' => array('url'),
              'foreign keys' => array(
                  'file_redirection_file_id_fkey' => array('file' => array('file_id' => 'id')),
              ),
          );
      }

      /**
       * Build an HTTPClient request for $url that follows redirects and
       * does not keep the response body.
       *
       * @param string $url    URL to request
       * @param int    $redirs maximum number of HTTP redirections to follow
       * @return HTTPClient configured, not yet sent, request
       */
      public static function _commonHttp($url, $redirs)
      {
          $request = new HTTPClient($url);
          $request->setConfig(array(
              'connect_timeout' => 10, // # seconds to wait
              'max_redirs' => $redirs, // # max number of http redirections to follow
              'follow_redirects' => true, // Follow redirects
              'store_body' => false, // We won't need body content here.
          ));
          return $request;
      }

      /**
       * Check if this URL is a redirect and return redir info.
       *
       * Most code should call File_redirection::where instead, to check if we
       * already know that redirection and avoid extra hits to the web.
       *
       * The URL is hit and any redirects are followed, up to $redirs levels or
       * until a protected URL is reached.
       *
       * @param string  $short_url URL to look up
       * @param int     $redirs    maximum number of redirects to follow
       * @param boolean $protected true when this call is a retry made after the
       *                           chain ended on a protected URL; the result is
       *                           then flagged with 'protected' => true
       * @return mixed one of:
       *         false  - if $redirs is negative
       *         string - the input URL, if it contains no '://' or the request
       *                  failed
       *         array  - redirect info, an associative array with the elements:
       *                      code: HTTP status code
       *                      redirects: count of redirects followed
       *                      url: URL string of final target
       *                      type (optional): MIME type from Content-Type header
       *                      protected (optional): true, see $protected
       *                      size (optional): byte size from Content-Length header
       *                      time (optional): timestamp from Last-Modified header
       */
      public static function lookupWhere($short_url, $redirs = 10, $protected = false)
      {
          if ($redirs < 0) {
              return false;
          }

          // Without a scheme separator there is nothing we can request.
          if (strpos($short_url, '://') === false) {
              return $short_url;
          }

          try {
              $request = self::_commonHttp($short_url, $redirs);
              // Don't include body in output
              $request->setMethod(HTTP_Request2::METHOD_HEAD);
              $response = $request->send();

              if (405 == $response->getStatus() || 204 == $response->getStatus()) {
                  // HTTP 405 Unsupported Method
                  // Server doesn't support HEAD method? Can this really happen?
                  // We'll try again as a GET and ignore the response data.
                  //
                  // HTTP 204 No Content
                  // YFrog sends 204 responses back for our HEAD checks, which
                  // seems like it may be a logic error in their servers. If
                  // we get a 204 back, re-run it as a GET... if there's really
                  // no content it'll be cheap. :)
                  $request = self::_commonHttp($short_url, $redirs);
                  $response = $request->send();
              }
          } catch (Exception $e) {
              // Invalid URL or failure to reach server
              common_log(LOG_ERR, "Error while following redirects for $short_url: " . $e->getMessage());
              return $short_url;
          }

          if ($response->getRedirectCount() && File::isProtected($response->getUrl())) {
              // Bump back up the redirect chain until we find a non-protected URL
              return self::lookupWhere($short_url, $response->getRedirectCount() - 1, true);
          }

          $ret = array(
              'code' => $response->getStatus(),
              'redirects' => $response->getRedirectCount(),
              'url' => $response->getUrl(),
          );

          $type = $response->getHeader('Content-Type');
          if ($type) {
              $ret['type'] = $type;
          }
          if ($protected) {
              $ret['protected'] = true;
          }
          $size = $response->getHeader('Content-Length'); // @fixme bytes?
          if ($size) {
              $ret['size'] = $size;
          }
          $time = $response->getHeader('Last-Modified');
          if ($time) {
              $ret['time'] = strtotime($time);
          }
          return $ret;
      }

      /**
       * Check if this URL is a redirect and return redir info.
       *
       * If a File record is present for this URL, it is not considered a redirect.
       * If a File_redirection record is present for this URL, the recorded target
       * is returned.
       *
       * If no File or File_redirection record is present (or the redirection
       * record points at a File that no longer exists), the URL is hit and any
       * redirects are followed, up to 10 levels or until a protected URL is
       * reached.
       *
       * @param string  $in_url   URL to resolve
       * @param boolean $discover true to attempt dereferencing the redirect if
       *                          we don't know it already
       * @return mixed one of:
       *         string - target URL, if this is a direct link or a known
       *                  redirect, or the input URL when $discover is false
       *         otherwise whatever File_redirection::lookupWhere() returns
       *         for an unknown URL (see there for the array layout)
       */
      public static function where($in_url, $discover = true)
      {
          // let's see if we know this...
          $file = File::getKV('url', $in_url);
          if ($file instanceof File) {
              // this is a direct link to $file->url
              return $file->url;
          }

          $redir = File_redirection::getKV('url', $in_url);
          if ($redir instanceof File_redirection) {
              // this is a redirect to $redir->file_id
              $target = File::getKV('id', $redir->file_id);
              if ($target instanceof File) {
                  return $target->url;
              }
              // The redirection row references a File we cannot load; treat
              // the URL as unknown instead of reading a property of a non-object.
          }

          if ($discover) {
              return self::lookupWhere($in_url);
          }

          // No manual dereferencing; leave the unknown URL as is.
          return $in_url;
      }

      /**
       * Shorten a URL with the current user's configured shortening
       * options, if applicable.
       *
       * If it cannot be shortened, the original is returned.
       *
       * If the referenced item has not been seen before, embedding data
       * may be saved.
       *
       * @param string $long_url
       * @param User   $user whose shortening options to use; defaults to the
       *                     current web session user
       * @return string
       */
      public static function makeShort($long_url, $user = null)
      {
          return self::_shortenOrOriginal($long_url, $user, false);
      }

      /**
       * Same as makeShort(), but passes the "force" flag on to the
       * shortener.
       *
       * If it cannot be shortened, the original is returned.
       *
       * If the referenced item has not been seen before, embedding data
       * may be saved.
       *
       * @param string $long_url
       * @param User   $user whose shortening options to use
       * @return string
       */
      public static function forceShort($long_url, $user)
      {
          return self::_shortenOrOriginal($long_url, $user, true);
      }

      /**
       * Shared implementation of makeShort() and forceShort(): canonicalize,
       * try to shorten, and fall back to the untouched input URL.
       *
       * @param string  $long_url
       * @param User    $user
       * @param boolean $force passed through to _userMakeShort()
       * @return string short URL, or $long_url if none was produced
       */
      protected static function _shortenOrOriginal($long_url, $user, $force)
      {
          $canon = self::_canonUrl($long_url);
          if ($canon === false) {
              // Empty or unusable URL: nothing sensible to hand to the shortener.
              return $long_url;
          }

          $short_url = self::_userMakeShort($canon, $user, $force);

          // Did we get one?
          if (!empty($short_url)) {
              return $short_url;
          }
          return $long_url;
      }

      /**
       * Ask common_shorten_url() for a short URL and, when one different from
       * the input is returned, make sure a File_redirection row links it to
       * the File record of the target.
       *
       * @param string  $long_url URL to shorten
       * @param User    $user     passed through to common_shorten_url()
       * @param boolean $force    passed through to common_shorten_url()
       * @return string|null the short URL, or null when no short URL was
       *                     produced or no File could be associated with it
       */
      public static function _userMakeShort($long_url, User $user = null, $force = false)
      {
          $short_url = common_shorten_url($long_url, $user, $force);
          if (empty($short_url) || $short_url == $long_url) {
              return null;
          }
          $short_url = (string)$short_url;

          // store it
          $file = File::getKV('url', $long_url);
          if (!$file instanceof File) {
              // Check if the target URL is itself a redirect...
              $redir_data = self::where($long_url);
              if (is_array($redir_data)) {
                  // We haven't seen the target URL before.
                  // Save file and embedding data about it!
                  $file = File::saveNew($redir_data, $long_url);
              } else if (is_string($redir_data)) {
                  // The file is a known redirect target.
                  $file = File::getKV('url', $redir_data);
              }

              if (!$file instanceof File) {
                  // @fixme should we save a new one?
                  // this case was triggering sometimes for redirects
                  // with unresolvable targets; found while fixing
                  // "can't linkify" bugs with shortened links to
                  // SSL sites with cert issues.
                  // It also covers lookups that returned neither an array
                  // nor a string, which used to leave the file id undefined.
                  return null;
              }
          }

          $file_redir = File_redirection::getKV('url', $short_url);
          if (!$file_redir instanceof File_redirection) {
              $file_redir = new File_redirection;
              $file_redir->url = $short_url;
              $file_redir->file_id = $file->id;
              $file_redir->insert();
          }
          return $short_url;
      }

      /**
       * Basic attempt to canonicalize a URL, cleaning up some standard variants
       * such as funny syntax or a missing path. Used internally when cleaning
       * up URLs for storage and following redirect chains.
       *
       * Note that despite being on File_redirection, this function DOES NOT
       * perform any dereferencing of redirects.
       *
       * @param string $in_url         input URL
       * @param string $default_scheme if given a bare link; defaults to 'http://'
       * @return string|false canonicalized URL, or false if the input is empty
       *                      or no usable scheme/host can be derived from it
       */
      public static function _canonUrl($in_url, $default_scheme = 'http://')
      {
          if (empty($in_url)) {
              return false;
          }

          $out_url = $in_url;
          $p = parse_url($out_url);

          if (empty($p['host']) || empty($p['scheme'])) {
              list($scheme) = explode(':', $in_url, 2);
              switch (strtolower($scheme)) {
              case 'fax':
              case 'tel':
                  // Strip each punctuation character individually; a single
                  // '.-()' search string would only match that exact sequence.
                  $out_url = str_replace(array('.', '-', '(', ')'), '', $out_url);
                  break;

              case 'mailto':
              case 'aim':
              case 'jabber':
              case 'xmpp':
                  // don't touch anything
                  break;

              default:
                  $out_url = $default_scheme . ltrim($out_url, '/');
                  $p = parse_url($out_url);
                  if (empty($p['scheme'])) {
                      return false;
                  }
                  break;
              }
          }

          // parse_url() may have returned false, and schemes are
          // case-insensitive, so normalize before comparing.
          $scheme = isset($p['scheme']) ? strtolower($p['scheme']) : '';
          if (in_array($scheme, array('ftp', 'ftps', 'http', 'https'), true)) {
              if (empty($p['host'])) {
                  return false;
              }
              if (empty($p['path'])) {
                  // Insert the missing root path right after the authority,
                  // i.e. before any query string or fragment.
                  $pos = strcspn($out_url, '?#');
                  $out_url = substr($out_url, 0, $pos) . '/' . (string)substr($out_url, $pos);
              }
          }

          return $out_url;
      }

      /**
       * Insert a new file_redirection row.
       *
       * @param array  $data    redirect info; the 'redirects' and 'code'
       *                        elements are stored as integers (0 if absent)
       * @param int    $file_id id of the File the URL resolves to
       * @param string $url     the redirecting URL (primary key)
       * @return void
       */
      public static function saveNew($data, $file_id, $url)
      {
          $file_redir = new File_redirection;
          $file_redir->url = $url;
          $file_redir->file_id = $file_id;
          $file_redir->redirections = isset($data['redirects']) ? intval($data['redirects']) : 0;
          $file_redir->httpcode = isset($data['code']) ? intval($data['code']) : 0;
          $file_redir->insert();
      }
  }