KURL.cpp 70 KB


  1. /*
  2. * Copyright (C) 2004, 2007, 2008, 2011, 2012, 2013 Apple Inc. All rights reserved.
  3. * Copyright (C) 2012 Research In Motion Limited. All rights reserved.
  4. *
  5. * Redistribution and use in source and binary forms, with or without
  6. * modification, are permitted provided that the following conditions
  7. * are met:
  8. * 1. Redistributions of source code must retain the above copyright
  9. * notice, this list of conditions and the following disclaimer.
  10. * 2. Redistributions in binary form must reproduce the above copyright
  11. * notice, this list of conditions and the following disclaimer in the
  12. * documentation and/or other materials provided with the distribution.
  13. *
  14. * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
  15. * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  16. * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  17. * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR
  18. * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
  19. * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  20. * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  21. * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
  22. * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  23. * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  24. * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  25. */
  26. #include "config.h"
  27. #include "KURL.h"
  28. #include "DecodeEscapeSequences.h"
  29. #include "MIMETypeRegistry.h"
  30. #include "TextEncoding.h"
  31. #include <stdio.h>
  32. #include <wtf/HashMap.h>
  33. #include <wtf/HexNumber.h>
  34. #include <wtf/StdLibExtras.h>
  35. #include <wtf/text/CString.h>
  36. #include <wtf/text/StringBuilder.h>
  37. #include <wtf/text/StringHash.h>
  38. #if USE(ICU_UNICODE)
  39. #include <unicode/uidna.h>
  40. #endif
  41. // FIXME: This file makes too much use of the + operator on String.
  42. // We either have to optimize that operator so it doesn't involve
  43. // so many allocations, or change this to use StringBuffer instead.
  44. using namespace std;
  45. using namespace WTF;
  46. namespace WebCore {
  47. typedef Vector<char, 512> CharBuffer;
  48. typedef Vector<UChar, 512> UCharBuffer;
  // KURL::port() returns invalidPortNumber for any port text that fails to parse
  // or whose value exceeds maximumValidPortNumber.
  static const unsigned invalidPortNumber = 65535;
  static const unsigned maximumValidPortNumber = invalidPortNumber - 1;
  51. static inline bool isLetterMatchIgnoringCase(UChar character, char lowercaseLetter)
  52. {
  53. ASSERT(isASCIILower(lowercaseLetter));
  54. return (character | 0x20) == lowercaseLetter;
  55. }
  // Well-known scheme names and default port strings, spelled out as character
  // lists. None of these arrays is null-terminated, so sizeof() yields the exact
  // character count.

  // Schemes listed without a port string.
  static const char wsScheme[] = { 'w', 's' };
  static const char wssScheme[] = { 'w', 's', 's' };
  static const char fileScheme[] = { 'f', 'i', 'l', 'e' };

  // Schemes paired with a port string.
  static const char ftpScheme[] = { 'f', 't', 'p' };
  static const char ftpPort[] = { '2', '1' };

  static const char httpScheme[] = { 'h', 't', 't', 'p' };
  static const char httpPort[] = { '8', '0' };

  static const char httpsScheme[] = { 'h', 't', 't', 'p', 's' };
  static const char httpsPort[] = { '4', '4', '3' };

  static const char gopherScheme[] = { 'g', 'o', 'p', 'h', 'e', 'r' };
  static const char gopherPort[] = { '7', '0' };
  67. static inline bool isLetterMatchIgnoringCase(char character, char lowercaseLetter)
  68. {
  69. ASSERT(isASCIILower(lowercaseLetter));
  70. return (character | 0x20) == lowercaseLetter;
  71. }
  // Bit flags describing which URL components a byte may appear in. The flags are
  // OR-ed together per byte value in characterClassTable.
  enum URLCharacterClasses {
      // alpha
      SchemeFirstChar = 0x01,

      // ( alpha | digit | "+" | "-" | "." )
      SchemeChar = 0x02,

      // mark = "-" | "_" | "." | "!" | "~" | "*" | "'" | "(" | ")"
      // unreserved = alphanum | mark
      // ( unreserved | escaped | ";" | ":" | "&" | "=" | "+" | "$" | "," )
      UserInfoChar = 0x04,

      // alnum | "." | "-" | "%"
      // The above is what the specification says, but we are lenient to
      // match existing practice and also allow:
      // "_"
      HostnameChar = 0x08,

      // hexdigit | ":" | "%"
      IPv6Char = 0x10,

      // "#" | "?" | "/" | nul
      PathSegmentEndChar = 0x20,

      // not allowed in path
      BadChar = 0x40
  };
  // Lookup table mapping every 8-bit value (0-255) to a bitwise OR of
  // URLCharacterClasses flags. The is*Char() helpers index it with the character
  // value and test a single flag. Entries worth noting:
  //  - nul, '#', '?' and '/' carry PathSegmentEndChar ('#' and '?' are also BadChar).
  //  - '%' is accepted in user info, host names and IPv6 literals but is also BadChar.
  //  - ASCII control characters, space, '"', '<', '>', del and every value from 128
  //    to 255 are BadChar only.
  //  - '@', '[', '\', ']', '^', '`', '{', '|' and '}' carry no flags at all.
  93. static const unsigned char characterClassTable[256] = {
  94. /* 0 nul */ PathSegmentEndChar, /* 1 soh */ BadChar,
  95. /* 2 stx */ BadChar, /* 3 etx */ BadChar,
  96. /* 4 eot */ BadChar, /* 5 enq */ BadChar, /* 6 ack */ BadChar, /* 7 bel */ BadChar,
  97. /* 8 bs */ BadChar, /* 9 ht */ BadChar, /* 10 nl */ BadChar, /* 11 vt */ BadChar,
  98. /* 12 np */ BadChar, /* 13 cr */ BadChar, /* 14 so */ BadChar, /* 15 si */ BadChar,
  99. /* 16 dle */ BadChar, /* 17 dc1 */ BadChar, /* 18 dc2 */ BadChar, /* 19 dc3 */ BadChar,
  100. /* 20 dc4 */ BadChar, /* 21 nak */ BadChar, /* 22 syn */ BadChar, /* 23 etb */ BadChar,
  101. /* 24 can */ BadChar, /* 25 em */ BadChar, /* 26 sub */ BadChar, /* 27 esc */ BadChar,
  102. /* 28 fs */ BadChar, /* 29 gs */ BadChar, /* 30 rs */ BadChar, /* 31 us */ BadChar,
  103. /* 32 sp */ BadChar, /* 33 ! */ UserInfoChar,
  104. /* 34 " */ BadChar, /* 35 # */ PathSegmentEndChar | BadChar,
  105. /* 36 $ */ UserInfoChar, /* 37 % */ UserInfoChar | HostnameChar | IPv6Char | BadChar,
  106. /* 38 & */ UserInfoChar, /* 39 ' */ UserInfoChar,
  107. /* 40 ( */ UserInfoChar, /* 41 ) */ UserInfoChar,
  108. /* 42 * */ UserInfoChar, /* 43 + */ SchemeChar | UserInfoChar,
  109. /* 44 , */ UserInfoChar,
  110. /* 45 - */ SchemeChar | UserInfoChar | HostnameChar,
  111. /* 46 . */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
  112. /* 47 / */ PathSegmentEndChar,
  113. /* 48 0 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
  114. /* 49 1 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
  115. /* 50 2 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
  116. /* 51 3 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
  117. /* 52 4 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
  118. /* 53 5 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
  119. /* 54 6 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
  120. /* 55 7 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
  121. /* 56 8 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
  122. /* 57 9 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
  123. /* 58 : */ UserInfoChar | IPv6Char, /* 59 ; */ UserInfoChar,
  124. /* 60 < */ BadChar, /* 61 = */ UserInfoChar,
  125. /* 62 > */ BadChar, /* 63 ? */ PathSegmentEndChar | BadChar,
  126. /* 64 @ */ 0,
  127. /* 65 A */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
  128. /* 66 B */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
  129. /* 67 C */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
  130. /* 68 D */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
  131. /* 69 E */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
  132. /* 70 F */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
  133. /* 71 G */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
  134. /* 72 H */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
  135. /* 73 I */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
  136. /* 74 J */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
  137. /* 75 K */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
  138. /* 76 L */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
  139. /* 77 M */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
  140. /* 78 N */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
  141. /* 79 O */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
  142. /* 80 P */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
  143. /* 81 Q */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
  144. /* 82 R */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
  145. /* 83 S */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
  146. /* 84 T */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
  147. /* 85 U */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
  148. /* 86 V */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
  149. /* 87 W */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
  150. /* 88 X */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
  151. /* 89 Y */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
  152. /* 90 Z */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
  153. /* 91 [ */ 0,
  154. /* 92 \ */ 0, /* 93 ] */ 0,
  155. /* 94 ^ */ 0,
  156. /* 95 _ */ UserInfoChar | HostnameChar,
  157. /* 96 ` */ 0,
  158. /* 97 a */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
  159. /* 98 b */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
  160. /* 99 c */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
  161. /* 100 d */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
  162. /* 101 e */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
  163. /* 102 f */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
  164. /* 103 g */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
  165. /* 104 h */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
  166. /* 105 i */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
  167. /* 106 j */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
  168. /* 107 k */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
  169. /* 108 l */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
  170. /* 109 m */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
  171. /* 110 n */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
  172. /* 111 o */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
  173. /* 112 p */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
  174. /* 113 q */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
  175. /* 114 r */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
  176. /* 115 s */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
  177. /* 116 t */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
  178. /* 117 u */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
  179. /* 118 v */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
  180. /* 119 w */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
  181. /* 120 x */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
  182. /* 121 y */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
  183. /* 122 z */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
  184. /* 123 { */ 0,
  185. /* 124 | */ 0, /* 125 } */ 0, /* 126 ~ */ UserInfoChar, /* 127 del */ BadChar,
  186. /* 128 */ BadChar, /* 129 */ BadChar, /* 130 */ BadChar, /* 131 */ BadChar,
  187. /* 132 */ BadChar, /* 133 */ BadChar, /* 134 */ BadChar, /* 135 */ BadChar,
  188. /* 136 */ BadChar, /* 137 */ BadChar, /* 138 */ BadChar, /* 139 */ BadChar,
  189. /* 140 */ BadChar, /* 141 */ BadChar, /* 142 */ BadChar, /* 143 */ BadChar,
  190. /* 144 */ BadChar, /* 145 */ BadChar, /* 146 */ BadChar, /* 147 */ BadChar,
  191. /* 148 */ BadChar, /* 149 */ BadChar, /* 150 */ BadChar, /* 151 */ BadChar,
  192. /* 152 */ BadChar, /* 153 */ BadChar, /* 154 */ BadChar, /* 155 */ BadChar,
  193. /* 156 */ BadChar, /* 157 */ BadChar, /* 158 */ BadChar, /* 159 */ BadChar,
  194. /* 160 */ BadChar, /* 161 */ BadChar, /* 162 */ BadChar, /* 163 */ BadChar,
  195. /* 164 */ BadChar, /* 165 */ BadChar, /* 166 */ BadChar, /* 167 */ BadChar,
  196. /* 168 */ BadChar, /* 169 */ BadChar, /* 170 */ BadChar, /* 171 */ BadChar,
  197. /* 172 */ BadChar, /* 173 */ BadChar, /* 174 */ BadChar, /* 175 */ BadChar,
  198. /* 176 */ BadChar, /* 177 */ BadChar, /* 178 */ BadChar, /* 179 */ BadChar,
  199. /* 180 */ BadChar, /* 181 */ BadChar, /* 182 */ BadChar, /* 183 */ BadChar,
  200. /* 184 */ BadChar, /* 185 */ BadChar, /* 186 */ BadChar, /* 187 */ BadChar,
  201. /* 188 */ BadChar, /* 189 */ BadChar, /* 190 */ BadChar, /* 191 */ BadChar,
  202. /* 192 */ BadChar, /* 193 */ BadChar, /* 194 */ BadChar, /* 195 */ BadChar,
  203. /* 196 */ BadChar, /* 197 */ BadChar, /* 198 */ BadChar, /* 199 */ BadChar,
  204. /* 200 */ BadChar, /* 201 */ BadChar, /* 202 */ BadChar, /* 203 */ BadChar,
  205. /* 204 */ BadChar, /* 205 */ BadChar, /* 206 */ BadChar, /* 207 */ BadChar,
  206. /* 208 */ BadChar, /* 209 */ BadChar, /* 210 */ BadChar, /* 211 */ BadChar,
  207. /* 212 */ BadChar, /* 213 */ BadChar, /* 214 */ BadChar, /* 215 */ BadChar,
  208. /* 216 */ BadChar, /* 217 */ BadChar, /* 218 */ BadChar, /* 219 */ BadChar,
  209. /* 220 */ BadChar, /* 221 */ BadChar, /* 222 */ BadChar, /* 223 */ BadChar,
  210. /* 224 */ BadChar, /* 225 */ BadChar, /* 226 */ BadChar, /* 227 */ BadChar,
  211. /* 228 */ BadChar, /* 229 */ BadChar, /* 230 */ BadChar, /* 231 */ BadChar,
  212. /* 232 */ BadChar, /* 233 */ BadChar, /* 234 */ BadChar, /* 235 */ BadChar,
  213. /* 236 */ BadChar, /* 237 */ BadChar, /* 238 */ BadChar, /* 239 */ BadChar,
  214. /* 240 */ BadChar, /* 241 */ BadChar, /* 242 */ BadChar, /* 243 */ BadChar,
  215. /* 244 */ BadChar, /* 245 */ BadChar, /* 246 */ BadChar, /* 247 */ BadChar,
  216. /* 248 */ BadChar, /* 249 */ BadChar, /* 250 */ BadChar, /* 251 */ BadChar,
  217. /* 252 */ BadChar, /* 253 */ BadChar, /* 254 */ BadChar, /* 255 */ BadChar
  218. };
  219. static int copyPathRemovingDots(char* dst, const char* src, int srcStart, int srcEnd);
  220. static void encodeRelativeString(const String& rel, const TextEncoding&, CharBuffer& ouput);
  221. static String substituteBackslashes(const String&);
  222. static inline bool isSchemeFirstChar(char c) { return characterClassTable[static_cast<unsigned char>(c)] & SchemeFirstChar; }
  223. static inline bool isSchemeFirstChar(UChar c) { return c <= 0xff && (characterClassTable[c] & SchemeFirstChar); }
  224. static inline bool isSchemeChar(char c) { return characterClassTable[static_cast<unsigned char>(c)] & SchemeChar; }
  225. static inline bool isSchemeChar(UChar c) { return c <= 0xff && (characterClassTable[c] & SchemeChar); }
  226. static inline bool isUserInfoChar(unsigned char c) { return characterClassTable[c] & UserInfoChar; }
  227. static inline bool isHostnameChar(unsigned char c) { return characterClassTable[c] & HostnameChar; }
  228. static inline bool isIPv6Char(unsigned char c) { return characterClassTable[c] & IPv6Char; }
  229. static inline bool isPathSegmentEndChar(char c) { return characterClassTable[static_cast<unsigned char>(c)] & PathSegmentEndChar; }
  230. static inline bool isPathSegmentEndChar(UChar c) { return c <= 0xff && (characterClassTable[c] & PathSegmentEndChar); }
  231. static inline bool isBadChar(unsigned char c) { return characterClassTable[c] & BadChar; }
  232. static inline bool isSchemeCharacterMatchIgnoringCase(char character, char schemeCharacter)
  233. {
  234. ASSERT(isSchemeChar(character));
  235. ASSERT(schemeCharacter & 0x20);
  236. ASSERT(isASCIILower(schemeCharacter) || (!isASCIIUpper(schemeCharacter) && isSchemeChar(schemeCharacter)));
  237. return (character | 0x20) == schemeCharacter;
  238. }
  239. // Copies the source to the destination, assuming all the source characters are
  240. // ASCII. The destination buffer must be large enough. Null characters are allowed
  241. // in the source string, and no attempt is made to null-terminate the result.
  242. static void copyASCII(const String& string, char* dest)
  243. {
  244. if (string.isEmpty())
  245. return;
  246. if (string.is8Bit())
  247. memcpy(dest, string.characters8(), string.length());
  248. else {
  249. const UChar* src = string.characters16();
  250. size_t length = string.length();
  251. for (size_t i = 0; i < length; i++)
  252. dest[i] = static_cast<char>(src[i]);
  253. }
  254. }
  255. static void appendASCII(const String& base, const char* rel, size_t len, CharBuffer& buffer)
  256. {
  257. buffer.resize(base.length() + len + 1);
  258. copyASCII(base, buffer.data());
  259. memcpy(buffer.data() + base.length(), rel, len);
  260. buffer[buffer.size() - 1] = '\0';
  261. }
  262. // FIXME: Move to WTFString.h eventually.
  263. // Returns the index of the first index in string |s| of any of the characters
  264. // in |toFind|. |toFind| should be a null-terminated string, all characters up
  265. // to the null will be searched. Returns int if not found.
  266. static int findFirstOf(const UChar* s, int sLen, int startPos, const char* toFind)
  267. {
  268. for (int i = startPos; i < sLen; i++) {
  269. const char* cur = toFind;
  270. while (*cur) {
  271. if (s[i] == *(cur++))
  272. return i;
  273. }
  274. }
  275. return -1;
  276. }
  277. static inline void checkEncodedString(const String& url)
  278. {
  279. ASSERT_UNUSED(url, url.containsOnlyASCII());
  280. ASSERT_UNUSED(url, url.isEmpty() || isSchemeFirstChar(url[0]));
  281. }
  282. inline bool KURL::protocolIs(const String& string, const char* protocol)
  283. {
  284. return WebCore::protocolIs(string, protocol);
  285. }
  286. void KURL::invalidate()
  287. {
  288. m_isValid = false;
  289. m_protocolIsInHTTPFamily = false;
  290. m_schemeEnd = 0;
  291. m_userStart = 0;
  292. m_userEnd = 0;
  293. m_passwordEnd = 0;
  294. m_hostEnd = 0;
  295. m_portEnd = 0;
  296. m_pathEnd = 0;
  297. m_pathAfterLastSlash = 0;
  298. m_queryEnd = 0;
  299. m_fragmentEnd = 0;
  300. }
  301. KURL::KURL(ParsedURLStringTag, const String& url)
  302. {
  303. parse(url);
  304. ASSERT(url == m_string);
  305. }
  306. KURL::KURL(const KURL& base, const String& relative)
  307. {
  308. init(base, relative, UTF8Encoding());
  309. }
  310. KURL::KURL(const KURL& base, const String& relative, const TextEncoding& encoding)
  311. {
  312. // For UTF-{7,16,32}, we want to use UTF-8 for the query part as
  313. // we do when submitting a form. A form with GET method
  314. // has its contents added to a URL as query params and it makes sense
  315. // to be consistent.
  316. init(base, relative, encoding.encodingForFormSubmission());
  317. }
  // Returns true for the bytes stripped from both ends of a URL: the space
  // character and everything below it. Browsers ignore such leading/trailing
  // whitespace and control characters. Because |c| is unsigned, bytes of 0x80 and
  // above are never treated as trimmable.
  static bool shouldTrimFromURL(unsigned char c)
  {
      const unsigned char firstKeptCharacter = 0x21; // '!', the character right after ' '.
      return c < firstKeptCharacter;
  }
  325. void KURL::init(const KURL& base, const String& relative, const TextEncoding& encoding)
  326. {
  327. // Allow resolutions with a null or empty base URL, but not with any other invalid one.
  328. // FIXME: Is this a good rule?
  329. if (!base.m_isValid && !base.isEmpty()) {
  330. m_string = relative;
  331. invalidate();
  332. return;
  333. }
  334. // For compatibility with Win IE, treat backslashes as if they were slashes,
  335. // as long as we're not dealing with javascript: or data: URLs.
  336. String rel = relative;
  337. if (rel.contains('\\') && !(protocolIsJavaScript(rel) || protocolIs(rel, "data")))
  338. rel = substituteBackslashes(rel);
  339. bool allASCII = rel.containsOnlyASCII();
  340. CharBuffer strBuffer;
  341. char* str;
  342. size_t len;
  343. if (allASCII) {
  344. len = rel.length();
  345. strBuffer.resize(len + 1);
  346. copyASCII(rel, strBuffer.data());
  347. strBuffer[len] = 0;
  348. str = strBuffer.data();
  349. } else {
  350. encodeRelativeString(rel, encoding, strBuffer);
  351. str = strBuffer.data();
  352. len = strlen(str);
  353. }
  354. // Get rid of leading whitespace and control characters.
  355. while (len && shouldTrimFromURL(*str)) {
  356. str++;
  357. --len;
  358. }
  359. // Get rid of trailing whitespace and control characters.
  360. while (len && shouldTrimFromURL(str[len - 1]))
  361. str[--len] = '\0';
  362. // According to the RFC, the reference should be interpreted as an
  363. // absolute URI if possible, using the "leftmost, longest"
  364. // algorithm. If the URI reference is absolute it will have a
  365. // scheme, meaning that it will have a colon before the first
  366. // non-scheme element.
  367. bool absolute = false;
  368. char* p = str;
  369. if (isSchemeFirstChar(*p)) {
  370. ++p;
  371. while (isSchemeChar(*p)) {
  372. ++p;
  373. }
  374. if (*p == ':') {
  375. if (p[1] != '/' && equalIgnoringCase(base.protocol(), String(str, p - str)) && base.isHierarchical())
  376. str = p + 1;
  377. else
  378. absolute = true;
  379. }
  380. }
  381. CharBuffer parseBuffer;
  382. if (absolute) {
  383. parse(str, &relative);
  384. } else {
  385. // If the base is empty or opaque (e.g. data: or javascript:), then the URL is invalid
  386. // unless the relative URL is a single fragment.
  387. if (!base.isHierarchical()) {
  388. if (str[0] == '#') {
  389. appendASCII(base.m_string.left(base.m_queryEnd), str, len, parseBuffer);
  390. parse(parseBuffer.data(), &relative);
  391. } else {
  392. m_string = relative;
  393. invalidate();
  394. }
  395. return;
  396. }
  397. switch (str[0]) {
  398. case '\0':
  399. // The reference is empty, so this is a reference to the same document with any fragment identifier removed.
  400. *this = base;
  401. removeFragmentIdentifier();
  402. break;
  403. case '#': {
  404. // must be fragment-only reference
  405. appendASCII(base.m_string.left(base.m_queryEnd), str, len, parseBuffer);
  406. parse(parseBuffer.data(), &relative);
  407. break;
  408. }
  409. case '?': {
  410. // query-only reference, special case needed for non-URL results
  411. appendASCII(base.m_string.left(base.m_pathEnd), str, len, parseBuffer);
  412. parse(parseBuffer.data(), &relative);
  413. break;
  414. }
  415. case '/':
  416. // must be net-path or absolute-path reference
  417. if (str[1] == '/') {
  418. // net-path
  419. appendASCII(base.m_string.left(base.m_schemeEnd + 1), str, len, parseBuffer);
  420. parse(parseBuffer.data(), &relative);
  421. } else {
  422. // abs-path
  423. appendASCII(base.m_string.left(base.m_portEnd), str, len, parseBuffer);
  424. parse(parseBuffer.data(), &relative);
  425. }
  426. break;
  427. default:
  428. {
  429. // must be relative-path reference
  430. // Base part plus relative part plus one possible slash added in between plus terminating \0 byte.
  431. const size_t bufferSize = base.m_pathEnd + 1 + len + 1;
  432. parseBuffer.resize(bufferSize);
  433. char* bufferPos = parseBuffer.data();
  434. char* bufferStart = bufferPos;
  435. // first copy everything before the path from the base
  436. CharBuffer baseStringBuffer(base.m_string.length());
  437. copyASCII(base.m_string, baseStringBuffer.data());
  438. const char* baseString = baseStringBuffer.data();
  439. const char* baseStringStart = baseString;
  440. const char* pathStart = baseStringStart + base.m_portEnd;
  441. while (baseStringStart < pathStart)
  442. *bufferPos++ = *baseStringStart++;
  443. char* bufferPathStart = bufferPos;
  444. // now copy the base path
  445. const char* baseStringEnd = baseString + base.m_pathEnd;
  446. // go back to the last slash
  447. while (baseStringEnd > baseStringStart && baseStringEnd[-1] != '/')
  448. baseStringEnd--;
  449. if (baseStringEnd == baseStringStart) {
  450. // no path in base, add a path separator if necessary
  451. if (base.m_schemeEnd + 1 != base.m_pathEnd && *str && *str != '?' && *str != '#')
  452. *bufferPos++ = '/';
  453. } else {
  454. bufferPos += copyPathRemovingDots(bufferPos, baseStringStart, 0, baseStringEnd - baseStringStart);
  455. }
  456. const char* relStringStart = str;
  457. const char* relStringPos = relStringStart;
  458. while (*relStringPos && *relStringPos != '?' && *relStringPos != '#') {
  459. if (relStringPos[0] == '.' && bufferPos[-1] == '/') {
  460. if (isPathSegmentEndChar(relStringPos[1])) {
  461. // skip over "." segment
  462. relStringPos += 1;
  463. if (relStringPos[0] == '/')
  464. relStringPos++;
  465. continue;
  466. } else if (relStringPos[1] == '.' && isPathSegmentEndChar(relStringPos[2])) {
  467. // skip over ".." segment and rewind the last segment
  468. // the RFC leaves it up to the app to decide what to do with excess
  469. // ".." segments - we choose to drop them since some web content
  470. // relies on this.
  471. relStringPos += 2;
  472. if (relStringPos[0] == '/')
  473. relStringPos++;
  474. if (bufferPos > bufferPathStart + 1)
  475. bufferPos--;
  476. while (bufferPos > bufferPathStart + 1 && bufferPos[-1] != '/')
  477. bufferPos--;
  478. continue;
  479. }
  480. }
  481. *bufferPos = *relStringPos;
  482. relStringPos++;
  483. bufferPos++;
  484. }
  485. // all done with the path work, now copy any remainder
  486. // of the relative reference; this will also add a null terminator
  487. strncpy(bufferPos, relStringPos, bufferSize - (bufferPos - bufferStart));
  488. parse(parseBuffer.data(), &relative);
  489. ASSERT(strlen(parseBuffer.data()) + 1 <= parseBuffer.size());
  490. break;
  491. }
  492. }
  493. }
  494. }
  495. KURL KURL::copy() const
  496. {
  497. KURL result = *this;
  498. result.m_string = result.m_string.isolatedCopy();
  499. return result;
  500. }
  501. String KURL::lastPathComponent() const
  502. {
  503. if (!hasPath())
  504. return String();
  505. unsigned end = m_pathEnd - 1;
  506. if (m_string[end] == '/')
  507. --end;
  508. size_t start = m_string.reverseFind('/', end);
  509. if (start < static_cast<unsigned>(m_portEnd))
  510. return String();
  511. ++start;
  512. return m_string.substring(start, end - start + 1);
  513. }
  514. String KURL::protocol() const
  515. {
  516. return m_string.left(m_schemeEnd);
  517. }
  518. String KURL::host() const
  519. {
  520. int start = hostStart();
  521. String substring = m_string.substring(start, m_hostEnd - start);
  522. return substring.isNull() ? emptyString() : substring;
  523. }
  524. unsigned short KURL::port() const
  525. {
  526. // We return a port of 0 if there is no port specified. This can happen in two situations:
  527. // 1) The URL contains no colon after the host name and before the path component of the URL.
  528. // 2) The URL contains a colon but there's no port number before the path component of the URL begins.
  529. if (m_hostEnd == m_portEnd || m_hostEnd == m_portEnd - 1)
  530. return 0;
  531. const UChar* stringData = m_string.characters();
  532. bool ok = false;
  533. unsigned number = charactersToUIntStrict(stringData + m_hostEnd + 1, m_portEnd - m_hostEnd - 1, &ok);
  534. if (!ok || number > maximumValidPortNumber)
  535. return invalidPortNumber;
  536. return number;
  537. }
  538. String KURL::pass() const
  539. {
  540. if (m_passwordEnd == m_userEnd)
  541. return String();
  542. return decodeURLEscapeSequences(m_string.substring(m_userEnd + 1, m_passwordEnd - m_userEnd - 1));
  543. }
  544. String KURL::user() const
  545. {
  546. return decodeURLEscapeSequences(m_string.substring(m_userStart, m_userEnd - m_userStart));
  547. }
  548. String KURL::fragmentIdentifier() const
  549. {
  550. if (m_fragmentEnd == m_queryEnd)
  551. return String();
  552. return m_string.substring(m_queryEnd + 1, m_fragmentEnd - (m_queryEnd + 1));
  553. }
  554. bool KURL::hasFragmentIdentifier() const
  555. {
  556. return m_fragmentEnd != m_queryEnd;
  557. }
  558. String KURL::baseAsString() const
  559. {
  560. return m_string.left(m_pathAfterLastSlash);
  561. }
  562. #if !PLATFORM(QT) && !USE(CF)
  563. String KURL::fileSystemPath() const
  564. {
  565. if (!isValid() || !isLocalFile())
  566. return String();
  567. return decodeURLEscapeSequences(path());
  568. }
  569. #endif
  570. #ifdef NDEBUG
  571. static inline void assertProtocolIsGood(const char*)
  572. {
  573. }
  574. #else
  575. static void assertProtocolIsGood(const char* protocol)
  576. {
  577. const char* p = protocol;
  578. while (*p) {
  579. ASSERT(*p > ' ' && *p < 0x7F && !(*p >= 'A' && *p <= 'Z'));
  580. ++p;
  581. }
  582. }
  583. #endif
  584. bool KURL::protocolIs(const char* protocol) const
  585. {
  586. assertProtocolIsGood(protocol);
  587. // JavaScript URLs are "valid" and should be executed even if KURL decides they are invalid.
  588. // The free function protocolIsJavaScript() should be used instead.
  589. ASSERT(!equalIgnoringCase(protocol, String("javascript")));
  590. if (!m_isValid)
  591. return false;
  592. // Do the comparison without making a new string object.
  593. for (int i = 0; i < m_schemeEnd; ++i) {
  594. if (!protocol[i] || !isSchemeCharacterMatchIgnoringCase(m_string[i], protocol[i]))
  595. return false;
  596. }
  597. return !protocol[m_schemeEnd]; // We should have consumed all characters in the argument.
  598. }
  599. String KURL::query() const
  600. {
  601. if (m_queryEnd == m_pathEnd)
  602. return String();
  603. return m_string.substring(m_pathEnd + 1, m_queryEnd - (m_pathEnd + 1));
  604. }
  605. String KURL::path() const
  606. {
  607. return m_string.substring(m_portEnd, m_pathEnd - m_portEnd);
  608. }
  609. bool KURL::setProtocol(const String& s)
  610. {
  611. // Firefox and IE remove everything after the first ':'.
  612. size_t separatorPosition = s.find(':');
  613. String newProtocol = s.substring(0, separatorPosition);
  614. if (!isValidProtocol(newProtocol))
  615. return false;
  616. if (!m_isValid) {
  617. parse(newProtocol + ':' + m_string);
  618. return true;
  619. }
  620. parse(newProtocol + m_string.substring(m_schemeEnd));
  621. return true;
  622. }
  623. void KURL::setHost(const String& s)
  624. {
  625. if (!m_isValid)
  626. return;
  627. // FIXME: Non-ASCII characters must be encoded and escaped to match parse() expectations,
  628. // and to avoid changing more than just the host.
  629. bool slashSlashNeeded = m_userStart == m_schemeEnd + 1;
  630. parse(m_string.left(hostStart()) + (slashSlashNeeded ? "//" : "") + s + m_string.substring(m_hostEnd));
  631. }
  632. void KURL::removePort()
  633. {
  634. if (m_hostEnd == m_portEnd)
  635. return;
  636. parse(m_string.left(m_hostEnd) + m_string.substring(m_portEnd));
  637. }
  638. void KURL::setPort(unsigned short i)
  639. {
  640. if (!m_isValid)
  641. return;
  642. bool colonNeeded = m_portEnd == m_hostEnd;
  643. int portStart = (colonNeeded ? m_hostEnd : m_hostEnd + 1);
  644. parse(m_string.left(portStart) + (colonNeeded ? ":" : "") + String::number(i) + m_string.substring(m_portEnd));
  645. }
  646. void KURL::setHostAndPort(const String& hostAndPort)
  647. {
  648. if (!m_isValid)
  649. return;
  650. // FIXME: Non-ASCII characters must be encoded and escaped to match parse() expectations,
  651. // and to avoid changing more than just host and port.
  652. bool slashSlashNeeded = m_userStart == m_schemeEnd + 1;
  653. parse(m_string.left(hostStart()) + (slashSlashNeeded ? "//" : "") + hostAndPort + m_string.substring(m_portEnd));
  654. }
  655. void KURL::setUser(const String& user)
  656. {
  657. if (!m_isValid)
  658. return;
  659. // FIXME: Non-ASCII characters must be encoded and escaped to match parse() expectations,
  660. // and to avoid changing more than just the user login.
  661. int end = m_userEnd;
  662. if (!user.isEmpty()) {
  663. String u = user;
  664. if (m_userStart == m_schemeEnd + 1)
  665. u = "//" + u;
  666. // Add '@' if we didn't have one before.
  667. if (end == m_hostEnd || (end == m_passwordEnd && m_string[end] != '@'))
  668. u.append('@');
  669. parse(m_string.left(m_userStart) + u + m_string.substring(end));
  670. } else {
  671. // Remove '@' if we now have neither user nor password.
  672. if (m_userEnd == m_passwordEnd && end != m_hostEnd && m_string[end] == '@')
  673. end += 1;
  674. // We don't want to parse in the extremely common case where we are not going to make a change.
  675. if (m_userStart != end)
  676. parse(m_string.left(m_userStart) + m_string.substring(end));
  677. }
  678. }
  679. void KURL::setPass(const String& password)
  680. {
  681. if (!m_isValid)
  682. return;
  683. // FIXME: Non-ASCII characters must be encoded and escaped to match parse() expectations,
  684. // and to avoid changing more than just the user password.
  685. int end = m_passwordEnd;
  686. if (!password.isEmpty()) {
  687. String p = ":" + password + "@";
  688. if (m_userEnd == m_schemeEnd + 1)
  689. p = "//" + p;
  690. // Eat the existing '@' since we are going to add our own.
  691. if (end != m_hostEnd && m_string[end] == '@')
  692. end += 1;
  693. parse(m_string.left(m_userEnd) + p + m_string.substring(end));
  694. } else {
  695. // Remove '@' if we now have neither user nor password.
  696. if (m_userStart == m_userEnd && end != m_hostEnd && m_string[end] == '@')
  697. end += 1;
  698. // We don't want to parse in the extremely common case where we are not going to make a change.
  699. if (m_userEnd != end)
  700. parse(m_string.left(m_userEnd) + m_string.substring(end));
  701. }
  702. }
  703. void KURL::setFragmentIdentifier(const String& s)
  704. {
  705. if (!m_isValid)
  706. return;
  707. // FIXME: Non-ASCII characters must be encoded and escaped to match parse() expectations.
  708. parse(m_string.left(m_queryEnd) + "#" + s);
  709. }
  710. void KURL::removeFragmentIdentifier()
  711. {
  712. if (!m_isValid)
  713. return;
  714. parse(m_string.left(m_queryEnd));
  715. }
  716. void KURL::setQuery(const String& query)
  717. {
  718. if (!m_isValid)
  719. return;
  720. // FIXME: '#' and non-ASCII characters must be encoded and escaped.
  721. // Usually, the query is encoded using document encoding, not UTF-8, but we don't have
  722. // access to the document in this function.
  723. if ((query.isEmpty() || query[0] != '?') && !query.isNull())
  724. parse(m_string.left(m_pathEnd) + "?" + query + m_string.substring(m_queryEnd));
  725. else
  726. parse(m_string.left(m_pathEnd) + query + m_string.substring(m_queryEnd));
  727. }
  728. void KURL::setPath(const String& s)
  729. {
  730. if (!m_isValid)
  731. return;
  732. // FIXME: encodeWithURLEscapeSequences does not correctly escape '#' and '?', so fragment and query parts
  733. // may be inadvertently affected.
  734. String path = s;
  735. if (path.isEmpty() || path[0] != '/')
  736. path = "/" + path;
  737. parse(m_string.left(m_portEnd) + encodeWithURLEscapeSequences(path) + m_string.substring(m_pathEnd));
  738. }
  739. String decodeURLEscapeSequences(const String& string)
  740. {
  741. return decodeEscapeSequences<URLEscapeSequence>(string, UTF8Encoding());
  742. }
  743. String decodeURLEscapeSequences(const String& string, const TextEncoding& encoding)
  744. {
  745. return decodeEscapeSequences<URLEscapeSequence>(string, encoding);
  746. }
  747. // Caution: This function does not bounds check.
  748. static void appendEscapedChar(char*& buffer, unsigned char c)
  749. {
  750. *buffer++ = '%';
  751. placeByteAsHex(c, buffer);
  752. }
  753. static void appendEscapingBadChars(char*& buffer, const char* strStart, size_t length)
  754. {
  755. char* p = buffer;
  756. const char* str = strStart;
  757. const char* strEnd = strStart + length;
  758. while (str < strEnd) {
  759. unsigned char c = *str++;
  760. if (isBadChar(c)) {
  761. if (c == '%' || c == '?')
  762. *p++ = c;
  763. else if (c != 0x09 && c != 0x0a && c != 0x0d)
  764. appendEscapedChar(p, c);
  765. } else
  766. *p++ = c;
  767. }
  768. buffer = p;
  769. }
  770. static void escapeAndAppendNonHierarchicalPart(char*& buffer, const char* strStart, size_t length)
  771. {
  772. char* p = buffer;
  773. const char* str = strStart;
  774. const char* strEnd = strStart + length;
  775. while (str < strEnd) {
  776. unsigned char c = *str++;
  777. // Strip CR, LF and Tab from fragments, per:
  778. // https://bugs.webkit.org/show_bug.cgi?id=8770
  779. if (c == 0x09 || c == 0x0a || c == 0x0d)
  780. continue;
  781. // Chrome and IE allow non-ascii characters in fragments, however doing
  782. // so would hit an ASSERT in checkEncodedString, so for now we don't.
  783. if (c < 0x20 || c >= 127) {
  784. appendEscapedChar(p, c);
  785. continue;
  786. }
  787. *p++ = c;
  788. }
  789. buffer = p;
  790. }
  791. // copy a path, accounting for "." and ".." segments
  792. static int copyPathRemovingDots(char* dst, const char* src, int srcStart, int srcEnd)
  793. {
  794. char* bufferPathStart = dst;
  795. // empty path is a special case, and need not have a leading slash
  796. if (srcStart != srcEnd) {
  797. const char* baseStringStart = src + srcStart;
  798. const char* baseStringEnd = src + srcEnd;
  799. const char* baseStringPos = baseStringStart;
  800. // this code is unprepared for paths that do not begin with a
  801. // slash and we should always have one in the source string
  802. ASSERT(baseStringPos[0] == '/');
  803. // copy the leading slash into the destination
  804. *dst = *baseStringPos;
  805. baseStringPos++;
  806. dst++;
  807. while (baseStringPos < baseStringEnd) {
  808. if (baseStringPos[0] == '.' && dst[-1] == '/') {
  809. if (baseStringPos[1] == '/' || baseStringPos + 1 == baseStringEnd) {
  810. // skip over "." segment
  811. baseStringPos += 2;
  812. continue;
  813. } else if (baseStringPos[1] == '.' && (baseStringPos[2] == '/' ||
  814. baseStringPos + 2 == baseStringEnd)) {
  815. // skip over ".." segment and rewind the last segment
  816. // the RFC leaves it up to the app to decide what to do with excess
  817. // ".." segments - we choose to drop them since some web content
  818. // relies on this.
  819. baseStringPos += 3;
  820. if (dst > bufferPathStart + 1)
  821. dst--;
  822. while (dst > bufferPathStart && dst[-1] != '/')
  823. dst--;
  824. continue;
  825. }
  826. }
  827. *dst = *baseStringPos;
  828. baseStringPos++;
  829. dst++;
  830. }
  831. }
  832. *dst = '\0';
  833. return dst - bufferPathStart;
  834. }
  // Returns true when the null-terminated |str| contains a '.' that directly
  // follows a '/' or another '.'. The first character is never reported on its
  // own because it has no predecessor.
  static inline bool hasSlashDotOrDotDot(const char* str)
  {
      if (!str[0])
          return false;

      for (const char* cursor = str + 1; *cursor; ++cursor) {
          if (*cursor != '.')
              continue;
          const char previous = cursor[-1];
          if (previous == '/' || previous == '.')
              return true;
      }
      return false;
  }
  848. void KURL::parse(const String& string)
  849. {
  850. checkEncodedString(string);
  851. CharBuffer buffer(string.length() + 1);
  852. copyASCII(string, buffer.data());
  853. buffer[string.length()] = '\0';
  854. parse(buffer.data(), &string);
  855. }
  // Returns true when the first |length| characters of |a| match the array |b|
  // element for element. |a| must be readable up to the first mismatch or for
  // |length| characters, whichever comes first.
  template<size_t length>
  static inline bool equal(const char* a, const char (&b)[length])
  {
      size_t matched = 0;
      while (matched < length && a[matched] == b[matched])
          ++matched;
      return matched == length;
  }
  // Length-checked variant: the strings are equal only when |lengthA| matches the
  // array size of |stringB| and the characters agree.
  template<size_t lengthB>
  static inline bool equal(const char* stringA, size_t lengthA, const char (&stringB)[lengthB])
  {
      if (lengthA != lengthB)
          return false;
      return equal(stringA, stringB);
  }
  870. // List of default schemes is taken from google-url:
  871. // http://code.google.com/p/google-url/source/browse/trunk/src/url_canon_stdurl.cc#120
  872. static inline bool isDefaultPortForScheme(const char* port, size_t portLength, const char* scheme, size_t schemeLength)
  873. {
  874. // This switch is theoretically a performance optimization. It came over when
  875. // the code was moved from google-url, but may be removed later.
  876. switch (schemeLength) {
  877. case 2:
  878. return equal(scheme, wsScheme) && equal(port, portLength, httpPort);
  879. case 3:
  880. if (equal(scheme, ftpScheme))
  881. return equal(port, portLength, ftpPort);
  882. if (equal(scheme, wssScheme))
  883. return equal(port, portLength, httpsPort);
  884. break;
  885. case 4:
  886. return equal(scheme, httpScheme) && equal(port, portLength, httpPort);
  887. case 5:
  888. return equal(scheme, httpsScheme) && equal(port, portLength, httpsPort);
  889. case 6:
  890. return equal(scheme, gopherScheme) && equal(port, portLength, gopherPort);
  891. }
  892. return false;
  893. }
  // True when the userinfo was terminated by '@' yet the host/port range that
  // follows it is empty (host start equals port end).
  static inline bool hostPortIsEmptyButCredentialsArePresent(int hostStart, int portEnd, char userinfoEndChar)
  {
      if (userinfoEndChar != '@')
          return false;
      return hostStart == portEnd;
  }
  898. static bool isNonFileHierarchicalScheme(const char* scheme, size_t schemeLength)
  899. {
  900. switch (schemeLength) {
  901. case 2:
  902. return equal(scheme, wsScheme);
  903. case 3:
  904. return equal(scheme, ftpScheme) || equal(scheme, wssScheme);
  905. case 4:
  906. return equal(scheme, httpScheme);
  907. case 5:
  908. return equal(scheme, httpsScheme);
  909. case 6:
  910. return equal(scheme, gopherScheme);
  911. }
  912. return false;
  913. }
  914. static bool isCanonicalHostnameLowercaseForScheme(const char* scheme, size_t schemeLength)
  915. {
  916. switch (schemeLength) {
  917. case 2:
  918. return equal(scheme, wsScheme);
  919. case 3:
  920. return equal(scheme, ftpScheme) || equal(scheme, wssScheme);
  921. case 4:
  922. return equal(scheme, httpScheme) || equal(scheme, fileScheme);
  923. case 5:
  924. return equal(scheme, httpsScheme);
  925. case 6:
  926. return equal(scheme, gopherScheme);
  927. }
  928. return false;
  929. }
  // Parses the null-terminated |url| into its components, stores a canonicalized
  // copy in m_string and records the component boundaries (m_schemeEnd,
  // m_userStart, m_userEnd, m_passwordEnd, m_hostEnd, m_portEnd, m_pathEnd,
  // m_pathAfterLastSlash, m_queryEnd, m_fragmentEnd) as offsets into that copy.
  //
  // On any syntax error, m_string is set to the input text (|*originalString|
  // when provided, otherwise |url|) and invalidate() is called.
  //
  // |originalString| may be null. When non-null and the canonical form turns out
  // to be identical to it, that String is reused instead of creating a new one.
  //
  // The function works in two passes: the first locates component ranges as
  // indices into |url|; the second assembles the canonical string into a local
  // buffer, recording the m_* offsets as it goes.
  930. void KURL::parse(const char* url, const String* originalString)
  931. {
  932. if (!url || url[0] == '\0') {
  933. // valid URL must be non-empty
  934. m_string = originalString ? *originalString : url;
  935. invalidate();
  936. return;
  937. }
  938. if (!isSchemeFirstChar(url[0])) {
  939. // scheme must start with an alphabetic character
  940. m_string = originalString ? *originalString : url;
  941. invalidate();
  942. return;
  943. }
  // Scan the scheme; it must be terminated by ':' or the URL is invalid.
  944. int schemeEnd = 0;
  945. while (isSchemeChar(url[schemeEnd]))
  946. schemeEnd++;
  947. if (url[schemeEnd] != ':') {
  948. m_string = originalString ? *originalString : url;
  949. invalidate();
  950. return;
  951. }
  // First pass: component ranges, expressed as indices into |url|.
  952. int userStart = schemeEnd + 1;
  953. int userEnd;
  954. int passwordStart;
  955. int passwordEnd;
  956. int hostStart;
  957. int hostEnd;
  958. int portStart;
  959. int portEnd;
  // A '/' right after the scheme's ':' marks the URL as hierarchical; a second
  // '/' is what triggers the search for an authority below.
  960. bool hierarchical = url[schemeEnd + 1] == '/';
  961. bool hasSecondSlash = hierarchical && url[schemeEnd + 2] == '/';
  // Case-insensitive check for the "file" scheme.
  962. bool isFile = schemeEnd == 4
  963. && isLetterMatchIgnoringCase(url[0], 'f')
  964. && isLetterMatchIgnoringCase(url[1], 'i')
  965. && isLetterMatchIgnoringCase(url[2], 'l')
  966. && isLetterMatchIgnoringCase(url[3], 'e');
  967. #if PLATFORM(BLACKBERRY)
  968. // Parse local: urls the same as file: urls.
  969. if (!isFile)
  970. isFile = schemeEnd == 5
  971. && isLetterMatchIgnoringCase(url[0], 'l')
  972. && isLetterMatchIgnoringCase(url[1], 'o')
  973. && isLetterMatchIgnoringCase(url[2], 'c')
  974. && isLetterMatchIgnoringCase(url[3], 'a')
  975. && isLetterMatchIgnoringCase(url[4], 'l');
  976. #endif
  // Case-insensitive check for a scheme of exactly "http" or "https".
  977. m_protocolIsInHTTPFamily = isLetterMatchIgnoringCase(url[0], 'h')
  978. && isLetterMatchIgnoringCase(url[1], 't')
  979. && isLetterMatchIgnoringCase(url[2], 't')
  980. && isLetterMatchIgnoringCase(url[3], 'p')
  981. && (url[4] == ':' || (isLetterMatchIgnoringCase(url[4], 's') && url[5] == ':'));
  // NOTE(review): isNonFileHierarchicalScheme() is given the raw, not yet
  // lower-cased scheme here and compares exactly, so an upper-case scheme only
  // takes this branch via the "//" test - confirm this is intended.
  982. if ((hierarchical && hasSecondSlash) || isNonFileHierarchicalScheme(url, schemeEnd)) {
  983. // The part after the scheme is either a net_path or an abs_path whose first path segment is empty.
  984. // Attempt to find an authority.
  985. // FIXME: Authority characters may be scanned twice, and it would be nice to be faster.
  986. if (hierarchical)
  987. userStart++;
  988. if (hasSecondSlash)
  989. userStart++;
  990. userEnd = userStart;
  // Scan userinfo characters, remembering the first ':' (the user/password
  // separator). colonPos == 0 doubles as "no colon seen".
  991. int colonPos = 0;
  992. while (isUserInfoChar(url[userEnd])) {
  993. if (url[userEnd] == ':' && colonPos == 0)
  994. colonPos = userEnd;
  995. userEnd++;
  996. }
  997. if (url[userEnd] == '@') {
  998. // actual end of the userinfo, start on the host
  999. if (colonPos != 0) {
  1000. passwordEnd = userEnd;
  1001. userEnd = colonPos;
  1002. passwordStart = colonPos + 1;
  1003. } else
  1004. passwordStart = passwordEnd = userEnd;
  1005. hostStart = passwordEnd + 1;
  1006. } else if (url[userEnd] == '[' || isPathSegmentEndChar(url[userEnd])) {
  1007. // hit the end of the authority, must have been no user
  1008. // or looks like an IPv6 hostname
  1009. // either way, try to parse it as a hostname
  1010. userEnd = userStart;
  1011. passwordStart = passwordEnd = userEnd;
  1012. hostStart = userStart;
  1013. } else {
  1014. // invalid character
  1015. m_string = originalString ? *originalString : url;
  1016. invalidate();
  1017. return;
  1018. }
  1019. hostEnd = hostStart;
  1020. // IPV6 IP address
  1021. if (url[hostEnd] == '[') {
  1022. hostEnd++;
  1023. while (isIPv6Char(url[hostEnd]))
  1024. hostEnd++;
  1025. if (url[hostEnd] == ']')
  1026. hostEnd++;
  1027. else {
  1028. // invalid character
  1029. m_string = originalString ? *originalString : url;
  1030. invalidate();
  1031. return;
  1032. }
  1033. } else {
  1034. while (isHostnameChar(url[hostEnd]))
  1035. hostEnd++;
  1036. }
  1037. if (url[hostEnd] == ':') {
  1038. portStart = portEnd = hostEnd + 1;
  1039. // possible start of port
  1040. portEnd = portStart;
  1041. while (isASCIIDigit(url[portEnd]))
  1042. portEnd++;
  1043. } else
  1044. portStart = portEnd = hostEnd;
  1045. if (!isPathSegmentEndChar(url[portEnd])) {
  1046. // invalid character
  1047. m_string = originalString ? *originalString : url;
  1048. invalidate();
  1049. return;
  1050. }
  // Credentials followed by an empty host/port are rejected.
  1051. if (hostPortIsEmptyButCredentialsArePresent(hostStart, portEnd, url[passwordEnd])) {
  1052. m_string = originalString ? *originalString : url;
  1053. invalidate();
  1054. return;
  1055. }
  1056. if (userStart == portEnd && !m_protocolIsInHTTPFamily && !isFile) {
  1057. // No authority found, which means that this is not a net_path, but rather an abs_path whose first two
  1058. // path segments are empty. For file, http and https only, an empty authority is allowed.
  // NOTE(review): the "-= 2" assumes both slashes were skipped above; this
  // branch is also reachable through isNonFileHierarchicalScheme() with fewer
  // than two slashes - confirm the resulting offsets for that case.
  1059. userStart -= 2;
  1060. userEnd = userStart;
  1061. passwordStart = userEnd;
  1062. passwordEnd = passwordStart;
  1063. hostStart = passwordEnd;
  1064. hostEnd = hostStart;
  1065. portStart = hostEnd;
  1066. portEnd = hostEnd;
  1067. }
  1068. } else {
  1069. // the part after the scheme must be an opaque_part or an abs_path
  1070. userEnd = userStart;
  1071. passwordStart = passwordEnd = userEnd;
  1072. hostStart = hostEnd = passwordEnd;
  1073. portStart = portEnd = hostEnd;
  1074. }
  // The path runs from the end of the port to the first '?' or '#' (or the end
  // of the string); the query runs to the first '#'; the fragment is the rest.
  1075. int pathStart = portEnd;
  1076. int pathEnd = pathStart;
  1077. while (url[pathEnd] && url[pathEnd] != '?' && url[pathEnd] != '#')
  1078. pathEnd++;
  1079. int queryStart = pathEnd;
  1080. int queryEnd = queryStart;
  1081. if (url[queryStart] == '?') {
  1082. while (url[queryEnd] && url[queryEnd] != '#')
  1083. queryEnd++;
  1084. }
  1085. int fragmentStart = queryEnd;
  1086. int fragmentEnd = fragmentStart;
  1087. if (url[fragmentStart] == '#') {
  1088. fragmentStart++;
  1089. fragmentEnd = fragmentStart;
  1090. while (url[fragmentEnd])
  1091. fragmentEnd++;
  1092. }
  // Second pass: build the canonical string in |buffer|.
  // The buffer holds three bytes per input character plus one; presumably this
  // covers the worst case of every character being percent-escaped - the append
  // helpers used below do not bounds check.
  1093. // assemble it all, remembering the real ranges
  1094. Vector<char, 4096> buffer(fragmentEnd * 3 + 1);
  1095. char *p = buffer.data();
  1096. const char *strPtr = url;
  1097. // copy in the scheme
  1098. const char *schemeEndPtr = url + schemeEnd;
  1099. while (strPtr < schemeEndPtr)
  1100. *p++ = toASCIILower(*strPtr++);
  1101. m_schemeEnd = p - buffer.data();
  // True when the whole authority (userStart..portEnd) is exactly the nine
  // characters "localhost", compared case-insensitively.
  1102. bool hostIsLocalHost = portEnd - userStart == 9
  1103. && isLetterMatchIgnoringCase(url[userStart], 'l')
  1104. && isLetterMatchIgnoringCase(url[userStart+1], 'o')
  1105. && isLetterMatchIgnoringCase(url[userStart+2], 'c')
  1106. && isLetterMatchIgnoringCase(url[userStart+3], 'a')
  1107. && isLetterMatchIgnoringCase(url[userStart+4], 'l')
  1108. && isLetterMatchIgnoringCase(url[userStart+5], 'h')
  1109. && isLetterMatchIgnoringCase(url[userStart+6], 'o')
  1110. && isLetterMatchIgnoringCase(url[userStart+7], 's')
  1111. && isLetterMatchIgnoringCase(url[userStart+8], 't');
  1112. // File URLs need a host part unless it is just file:// or file://localhost
  1113. bool degenerateFilePath = pathStart == pathEnd && (hostStart == hostEnd || hostIsLocalHost);
  1114. // We drop empty credentials, but keep a colon in an empty host/port pair.
  1115. // Removing hostname completely would change the structure of the URL on re-parsing.
  1116. bool haveNonHostAuthorityPart = userStart != userEnd || passwordStart != passwordEnd || hostEnd != portEnd;
  1117. // add ":" after scheme
  1118. *p++ = ':';
  1119. // if we have at least one authority part or a file URL - add "//" and authority
  1120. if (isFile ? !degenerateFilePath : (haveNonHostAuthorityPart || hostStart != hostEnd)) {
  1121. *p++ = '/';
  1122. *p++ = '/';
  1123. m_userStart = p - buffer.data();
  1124. // copy in the user
  1125. strPtr = url + userStart;
  1126. const char* userEndPtr = url + userEnd;
  1127. while (strPtr < userEndPtr) {
  1128. char c = *strPtr++;
  1129. ASSERT(isUserInfoChar(c));
  1130. *p++ = c;
  1131. }
  1132. m_userEnd = p - buffer.data();
  1133. // copy in the password
  1134. if (passwordEnd != passwordStart) {
  1135. *p++ = ':';
  1136. strPtr = url + passwordStart;
  1137. const char* passwordEndPtr = url + passwordEnd;
  1138. while (strPtr < passwordEndPtr) {
  1139. char c = *strPtr++;
  1140. ASSERT(isUserInfoChar(c));
  1141. *p++ = c;
  1142. }
  1143. }
  1144. m_passwordEnd = p - buffer.data();
  1145. // If we had any user info, add "@"
  1146. if (p - buffer.data() != m_userStart)
  1147. *p++ = '@';
  1148. // copy in the host, except in the case of a file URL with authority="localhost"
  1149. if (!(isFile && hostIsLocalHost && !haveNonHostAuthorityPart)) {
  1150. strPtr = url + hostStart;
  1151. const char* hostEndPtr = url + hostEnd;
  // The scheme in |buffer| is already lower-cased, so the exact comparison
  // inside isCanonicalHostnameLowercaseForScheme() is case-insensitive with
  // respect to the input.
  1152. if (isCanonicalHostnameLowercaseForScheme(buffer.data(), m_schemeEnd)) {
  1153. while (strPtr < hostEndPtr) {
  1154. char c = toASCIILower(*strPtr++);
  1155. ASSERT(isHostnameChar(c) || c == '[' || c == ']' || c == ':');
  1156. *p++ = c;
  1157. }
  1158. } else {
  1159. while (strPtr < hostEndPtr) {
  1160. char c = *strPtr++;
  1161. ASSERT(isHostnameChar(c) || c == '[' || c == ']' || c == ':');
  1162. *p++ = c;
  1163. }
  1164. }
  1165. }
  1166. m_hostEnd = p - buffer.data();
  1167. // Copy in the port if the URL has one (and it's not default). Also, copy it if there was no hostname, so that there is still something in authority component.
  1168. if (hostEnd != portStart) {
  1169. const char* portStr = url + portStart;
  1170. size_t portLength = portEnd - portStart;
  1171. if ((portLength && !isDefaultPortForScheme(portStr, portLength, buffer.data(), m_schemeEnd))
  1172. || (hostStart == hostEnd && hostEnd != portStart)) {
  1173. *p++ = ':';
  1174. const char* portEndPtr = url + portEnd;
  1175. while (portStr < portEndPtr)
  1176. *p++ = *portStr++;
  1177. }
  1178. }
  1179. m_portEnd = p - buffer.data();
  1180. } else {
  // No authority is emitted; all authority offsets collapse to the current
  // position. A degenerate file URL still gets its "//".
  1181. if (isFile) {
  1182. ASSERT(degenerateFilePath);
  1183. *p++ = '/';
  1184. *p++ = '/';
  1185. }
  1186. m_userStart = m_userEnd = m_passwordEnd = m_hostEnd = m_portEnd = p - buffer.data();
  1187. }
  1188. // For canonicalization, ensure we have a '/' for no path.
  1189. // Do this only for URL with protocol file, http or https.
  1190. if ((m_protocolIsInHTTPFamily || isFile) && pathEnd == pathStart)
  1191. *p++ = '/';
  1192. // add path, escaping bad characters
  // Dot-segment removal is only attempted when hasSlashDotOrDotDot() reports a
  // candidate somewhere in the whole input string (not just the path).
  1193. if (!hierarchical)
  1194. escapeAndAppendNonHierarchicalPart(p, url + pathStart, pathEnd - pathStart);
  1195. else if (!hasSlashDotOrDotDot(url))
  1196. appendEscapingBadChars(p, url + pathStart, pathEnd - pathStart);
  1197. else {
  1198. CharBuffer pathBuffer(pathEnd - pathStart + 1);
  1199. size_t length = copyPathRemovingDots(pathBuffer.data(), url, pathStart, pathEnd);
  1200. appendEscapingBadChars(p, pathBuffer.data(), length);
  1201. }
  1202. m_pathEnd = p - buffer.data();
  1203. // Find the position after the last slash in the path, or
  1204. // the position before the path if there are no slashes in it.
  1205. int i;
  1206. for (i = m_pathEnd; i > m_portEnd; --i) {
  1207. if (buffer[i - 1] == '/')
  1208. break;
  1209. }
  1210. m_pathAfterLastSlash = i;
  1211. // add query, escaping bad characters
  1212. appendEscapingBadChars(p, url + queryStart, queryEnd - queryStart);
  1213. m_queryEnd = p - buffer.data();
  1214. // add fragment, escaping bad characters
  1215. if (fragmentEnd != queryEnd) {
  1216. *p++ = '#';
  1217. escapeAndAppendNonHierarchicalPart(p, url + fragmentStart, fragmentEnd - fragmentStart);
  1218. }
  1219. m_fragmentEnd = p - buffer.data();
  1220. ASSERT(p - buffer.data() <= static_cast<int>(buffer.size()));
  1221. ASSERT(buffer.size() > 0);
  1222. // If we didn't end up actually changing the original string and
  1223. // it was already in a String, reuse it to avoid extra allocation.
  1224. if (originalString && equal(originalString->impl(), buffer.data(), m_fragmentEnd))
  1225. m_string = *originalString;
  1226. else
  1227. m_string = String(buffer.data(), m_fragmentEnd);
  1228. m_isValid = true;
  1229. }
  1230. bool equalIgnoringFragmentIdentifier(const KURL& a, const KURL& b)
  1231. {
  1232. if (a.m_queryEnd != b.m_queryEnd)
  1233. return false;
  1234. unsigned queryLength = a.m_queryEnd;
  1235. for (unsigned i = 0; i < queryLength; ++i)
  1236. if (a.string()[i] != b.string()[i])
  1237. return false;
  1238. return true;
  1239. }
  1240. bool protocolHostAndPortAreEqual(const KURL& a, const KURL& b)
  1241. {
  1242. if (a.m_schemeEnd != b.m_schemeEnd)
  1243. return false;
  1244. int hostStartA = a.hostStart();
  1245. int hostLengthA = a.hostEnd() - hostStartA;
  1246. int hostStartB = b.hostStart();
  1247. int hostLengthB = b.hostEnd() - b.hostStart();
  1248. if (hostLengthA != hostLengthB)
  1249. return false;
  1250. // Check the scheme
  1251. for (int i = 0; i < a.m_schemeEnd; ++i)
  1252. if (a.string()[i] != b.string()[i])
  1253. return false;
  1254. // And the host
  1255. for (int i = 0; i < hostLengthA; ++i)
  1256. if (a.string()[hostStartA + i] != b.string()[hostStartB + i])
  1257. return false;
  1258. if (a.port() != b.port())
  1259. return false;
  1260. return true;
  1261. }
  1262. String encodeWithURLEscapeSequences(const String& notEncodedString)
  1263. {
  1264. CString asUTF8 = notEncodedString.utf8();
  1265. CharBuffer buffer(asUTF8.length() * 3 + 1);
  1266. char* p = buffer.data();
  1267. const char* str = asUTF8.data();
  1268. const char* strEnd = str + asUTF8.length();
  1269. while (str < strEnd) {
  1270. unsigned char c = *str++;
  1271. if (isBadChar(c))
  1272. appendEscapedChar(p, c);
  1273. else
  1274. *p++ = c;
  1275. }
  1276. ASSERT(p - buffer.data() <= static_cast<int>(buffer.size()));
  1277. return String(buffer.data(), p - buffer.data());
  1278. }
  1279. // Appends the punycoded hostname identified by the given string and length to
  1280. // the output buffer. The result will not be null terminated.
  1281. static void appendEncodedHostname(UCharBuffer& buffer, const UChar* str, unsigned strLen)
  1282. {
  1283. // Needs to be big enough to hold an IDN-encoded name.
  1284. // For host names bigger than this, we won't do IDN encoding, which is almost certainly OK.
  1285. const unsigned hostnameBufferLength = 2048;
  1286. if (strLen > hostnameBufferLength || charactersAreAllASCII(str, strLen)) {
  1287. buffer.append(str, strLen);
  1288. return;
  1289. }
  1290. #if USE(ICU_UNICODE)
  1291. UChar hostnameBuffer[hostnameBufferLength];
  1292. UErrorCode error = U_ZERO_ERROR;
  1293. int32_t numCharactersConverted = uidna_IDNToASCII(str, strLen, hostnameBuffer,
  1294. hostnameBufferLength, UIDNA_ALLOW_UNASSIGNED, 0, &error);
  1295. if (error == U_ZERO_ERROR)
  1296. buffer.append(hostnameBuffer, numCharactersConverted);
  1297. #endif
  1298. }
  1299. static void findHostnamesInMailToURL(const UChar* str, int strLen, Vector<pair<int, int> >& nameRanges)
  1300. {
  1301. // In a mailto: URL, host names come after a '@' character and end with a '>' or ',' or '?' or end of string character.
  1302. // Skip quoted strings so that characters in them don't confuse us.
  1303. // When we find a '?' character, we are past the part of the URL that contains host names.
  1304. nameRanges.clear();
  1305. int p = 0;
  1306. while (1) {
  1307. // Find start of host name or of quoted string.
  1308. int hostnameOrStringStart = findFirstOf(str, strLen, p, "\"@?");
  1309. if (hostnameOrStringStart == -1)
  1310. return;
  1311. UChar c = str[hostnameOrStringStart];
  1312. p = hostnameOrStringStart + 1;
  1313. if (c == '?')
  1314. return;
  1315. if (c == '@') {
  1316. // Find end of host name.
  1317. int hostnameStart = p;
  1318. int hostnameEnd = findFirstOf(str, strLen, p, ">,?");
  1319. bool done;
  1320. if (hostnameEnd == -1) {
  1321. hostnameEnd = strLen;
  1322. done = true;
  1323. } else {
  1324. p = hostnameEnd;
  1325. done = false;
  1326. }
  1327. nameRanges.append(make_pair(hostnameStart, hostnameEnd));
  1328. if (done)
  1329. return;
  1330. } else {
  1331. // Skip quoted string.
  1332. ASSERT(c == '"');
  1333. while (1) {
  1334. int escapedCharacterOrStringEnd = findFirstOf(str, strLen, p, "\"\\");
  1335. if (escapedCharacterOrStringEnd == -1)
  1336. return;
  1337. c = str[escapedCharacterOrStringEnd];
  1338. p = escapedCharacterOrStringEnd + 1;
  1339. // If we are the end of the string, then break from the string loop back to the host name loop.
  1340. if (c == '"')
  1341. break;
  1342. // Skip escaped character.
  1343. ASSERT(c == '\\');
  1344. if (p == strLen)
  1345. return;
  1346. ++p;
  1347. }
  1348. }
  1349. }
  1350. }
  1351. static bool findHostnameInHierarchicalURL(const UChar* str, int strLen, int& startOffset, int& endOffset)
  1352. {
  1353. // Find the host name in a hierarchical URL.
  1354. // It comes after a "://" sequence, with scheme characters preceding, and
  1355. // this should be the first colon in the string.
  1356. // It ends with the end of the string or a ":" or a path segment ending character.
  1357. // If there is a "@" character, the host part is just the part after the "@".
  1358. int separator = findFirstOf(str, strLen, 0, ":");
  1359. if (separator == -1 || separator + 2 >= strLen ||
  1360. str[separator + 1] != '/' || str[separator + 2] != '/')
  1361. return false;
  1362. // Check that all characters before the :// are valid scheme characters.
  1363. if (!isSchemeFirstChar(str[0]))
  1364. return false;
  1365. for (int i = 1; i < separator; ++i) {
  1366. if (!isSchemeChar(str[i]))
  1367. return false;
  1368. }
  1369. // Start after the separator.
  1370. int authorityStart = separator + 3;
  1371. // Find terminating character.
  1372. int hostnameEnd = strLen;
  1373. for (int i = authorityStart; i < strLen; ++i) {
  1374. UChar c = str[i];
  1375. if (c == ':' || (isPathSegmentEndChar(c) && c != 0)) {
  1376. hostnameEnd = i;
  1377. break;
  1378. }
  1379. }
  1380. // Find "@" for the start of the host name.
  1381. int userInfoTerminator = findFirstOf(str, strLen, authorityStart, "@");
  1382. int hostnameStart;
  1383. if (userInfoTerminator == -1 || userInfoTerminator > hostnameEnd)
  1384. hostnameStart = authorityStart;
  1385. else
  1386. hostnameStart = userInfoTerminator + 1;
  1387. startOffset = hostnameStart;
  1388. endOffset = hostnameEnd;
  1389. return true;
  1390. }
  1391. // Converts all hostnames found in the given input to punycode, preserving the
  1392. // rest of the URL unchanged. The output will NOT be null-terminated.
  1393. static void encodeHostnames(const String& str, UCharBuffer& output)
  1394. {
  1395. output.clear();
  1396. if (protocolIs(str, "mailto")) {
  1397. Vector<pair<int, int> > hostnameRanges;
  1398. findHostnamesInMailToURL(str.characters(), str.length(), hostnameRanges);
  1399. int n = hostnameRanges.size();
  1400. int p = 0;
  1401. for (int i = 0; i < n; ++i) {
  1402. const pair<int, int>& r = hostnameRanges[i];
  1403. output.append(&str.characters()[p], r.first - p);
  1404. appendEncodedHostname(output, &str.characters()[r.first], r.second - r.first);
  1405. p = r.second;
  1406. }
  1407. // This will copy either everything after the last hostname, or the
  1408. // whole thing if there is no hostname.
  1409. output.append(&str.characters()[p], str.length() - p);
  1410. } else {
  1411. int hostStart, hostEnd;
  1412. if (findHostnameInHierarchicalURL(str.characters(), str.length(), hostStart, hostEnd)) {
  1413. output.append(str.characters(), hostStart); // Before hostname.
  1414. appendEncodedHostname(output, &str.characters()[hostStart], hostEnd - hostStart);
  1415. output.append(&str.characters()[hostEnd], str.length() - hostEnd); // After hostname.
  1416. } else {
  1417. // No hostname to encode, return the input.
  1418. output.append(str.characters(), str.length());
  1419. }
  1420. }
  1421. }
  1422. static void encodeRelativeString(const String& rel, const TextEncoding& encoding, CharBuffer& output)
  1423. {
  1424. UCharBuffer s;
  1425. encodeHostnames(rel, s);
  1426. TextEncoding pathEncoding(UTF8Encoding()); // Path is always encoded as UTF-8; other parts may depend on the scheme.
  1427. int pathEnd = -1;
  1428. if (encoding != pathEncoding && encoding.isValid() && !protocolIs(rel, "mailto") && !protocolIs(rel, "data") && !protocolIsJavaScript(rel)) {
  1429. // Find the first instance of either # or ?, keep pathEnd at -1 otherwise.
  1430. pathEnd = findFirstOf(s.data(), s.size(), 0, "#?");
  1431. }
  1432. if (pathEnd == -1) {
  1433. CString decoded = pathEncoding.encode(s.data(), s.size(), URLEncodedEntitiesForUnencodables);
  1434. output.resize(decoded.length());
  1435. memcpy(output.data(), decoded.data(), decoded.length());
  1436. } else {
  1437. CString pathDecoded = pathEncoding.encode(s.data(), pathEnd, URLEncodedEntitiesForUnencodables);
  1438. // Unencodable characters in URLs are represented by converting
  1439. // them to XML entities and escaping non-alphanumeric characters.
  1440. CString otherDecoded = encoding.encode(s.data() + pathEnd, s.size() - pathEnd, URLEncodedEntitiesForUnencodables);
  1441. output.resize(pathDecoded.length() + otherDecoded.length());
  1442. memcpy(output.data(), pathDecoded.data(), pathDecoded.length());
  1443. memcpy(output.data() + pathDecoded.length(), otherDecoded.data(), otherDecoded.length());
  1444. }
  1445. output.append('\0'); // null-terminate the output.
  1446. }
  1447. static String substituteBackslashes(const String& string)
  1448. {
  1449. size_t questionPos = string.find('?');
  1450. size_t hashPos = string.find('#');
  1451. unsigned pathEnd;
  1452. if (hashPos != notFound && (questionPos == notFound || questionPos > hashPos))
  1453. pathEnd = hashPos;
  1454. else if (questionPos != notFound)
  1455. pathEnd = questionPos;
  1456. else
  1457. pathEnd = string.length();
  1458. return string.left(pathEnd).replace('\\','/') + string.substring(pathEnd);
  1459. }
  1460. bool KURL::isHierarchical() const
  1461. {
  1462. if (!m_isValid)
  1463. return false;
  1464. ASSERT(m_string[m_schemeEnd] == ':');
  1465. return m_string[m_schemeEnd + 1] == '/';
  1466. }
  1467. void KURL::copyToBuffer(Vector<char, 512>& buffer) const
  1468. {
  1469. // FIXME: This throws away the high bytes of all the characters in the string!
  1470. // That's fine for a valid URL, which is all ASCII, but not for invalid URLs.
  1471. buffer.resize(m_string.length());
  1472. copyASCII(m_string, buffer.data());
  1473. }
  1474. bool protocolIs(const String& url, const char* protocol)
  1475. {
  1476. // Do the comparison without making a new string object.
  1477. assertProtocolIsGood(protocol);
  1478. for (int i = 0; ; ++i) {
  1479. if (!protocol[i])
  1480. return url[i] == ':';
  1481. if (!isLetterMatchIgnoringCase(url[i], protocol[i]))
  1482. return false;
  1483. }
  1484. }
  1485. bool isValidProtocol(const String& protocol)
  1486. {
  1487. // RFC3986: ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
  1488. if (protocol.isEmpty())
  1489. return false;
  1490. if (!isSchemeFirstChar(protocol[0]))
  1491. return false;
  1492. unsigned protocolLength = protocol.length();
  1493. for (unsigned i = 1; i < protocolLength; i++) {
  1494. if (!isSchemeChar(protocol[i]))
  1495. return false;
  1496. }
  1497. return true;
  1498. }
  1499. #ifndef NDEBUG
  1500. void KURL::print() const
  1501. {
  1502. printf("%s\n", m_string.utf8().data());
  1503. }
  1504. #endif
  1505. String KURL::strippedForUseAsReferrer() const
  1506. {
  1507. KURL referrer(*this);
  1508. referrer.setUser(String());
  1509. referrer.setPass(String());
  1510. referrer.removeFragmentIdentifier();
  1511. return referrer.string();
  1512. }
  1513. bool KURL::isLocalFile() const
  1514. {
  1515. // Including feed here might be a bad idea since drag and drop uses this check
  1516. // and including feed would allow feeds to potentially let someone's blog
  1517. // read the contents of the clipboard on a drag, even without a drop.
  1518. // Likewise with using the FrameLoader::shouldTreatURLAsLocal() function.
  1519. return protocolIs("file");
  1520. }
  1521. bool protocolIsJavaScript(const String& url)
  1522. {
  1523. return protocolIs(url, "javascript");
  1524. }
  1525. const KURL& blankURL()
  1526. {
  1527. DEFINE_STATIC_LOCAL(KURL, staticBlankURL, (ParsedURLString, "about:blank"));
  1528. return staticBlankURL;
  1529. }
  1530. bool KURL::isBlankURL() const
  1531. {
  1532. return protocolIs("about");
  1533. }
  1534. bool isDefaultPortForProtocol(unsigned short port, const String& protocol)
  1535. {
  1536. if (protocol.isEmpty())
  1537. return false;
  1538. typedef HashMap<String, unsigned, CaseFoldingHash> DefaultPortsMap;
  1539. DEFINE_STATIC_LOCAL(DefaultPortsMap, defaultPorts, ());
  1540. if (defaultPorts.isEmpty()) {
  1541. defaultPorts.set("http", 80);
  1542. defaultPorts.set("https", 443);
  1543. defaultPorts.set("ftp", 21);
  1544. defaultPorts.set("ftps", 990);
  1545. }
  1546. return defaultPorts.get(protocol) == port;
  1547. }
  1548. bool portAllowed(const KURL& url)
  1549. {
  1550. unsigned short port = url.port();
  1551. // Since most URLs don't have a port, return early for the "no port" case.
  1552. if (!port)
  1553. return true;
  1554. // This blocked port list matches the port blocking that Mozilla implements.
  1555. // See http://www.mozilla.org/projects/netlib/PortBanning.html for more information.
  1556. static const unsigned short blockedPortList[] = {
  1557. 1, // tcpmux
  1558. 7, // echo
  1559. 9, // discard
  1560. 11, // systat
  1561. 13, // daytime
  1562. 15, // netstat
  1563. 17, // qotd
  1564. 19, // chargen
  1565. 20, // FTP-data
  1566. 21, // FTP-control
  1567. 22, // SSH
  1568. 23, // telnet
  1569. 25, // SMTP
  1570. 37, // time
  1571. 42, // name
  1572. 43, // nicname
  1573. 53, // domain
  1574. 77, // priv-rjs
  1575. 79, // finger
  1576. 87, // ttylink
  1577. 95, // supdup
  1578. 101, // hostriame
  1579. 102, // iso-tsap
  1580. 103, // gppitnp
  1581. 104, // acr-nema
  1582. 109, // POP2
  1583. 110, // POP3
  1584. 111, // sunrpc
  1585. 113, // auth
  1586. 115, // SFTP
  1587. 117, // uucp-path
  1588. 119, // nntp
  1589. 123, // NTP
  1590. 135, // loc-srv / epmap
  1591. 139, // netbios
  1592. 143, // IMAP2
  1593. 179, // BGP
  1594. 389, // LDAP
  1595. 465, // SMTP+SSL
  1596. 512, // print / exec
  1597. 513, // login
  1598. 514, // shell
  1599. 515, // printer
  1600. 526, // tempo
  1601. 530, // courier
  1602. 531, // Chat
  1603. 532, // netnews
  1604. 540, // UUCP
  1605. 556, // remotefs
  1606. 563, // NNTP+SSL
  1607. 587, // ESMTP
  1608. 601, // syslog-conn
  1609. 636, // LDAP+SSL
  1610. 993, // IMAP+SSL
  1611. 995, // POP3+SSL
  1612. 2049, // NFS
  1613. 3659, // apple-sasl / PasswordServer [Apple addition]
  1614. 4045, // lockd
  1615. 6000, // X11
  1616. 6665, // Alternate IRC [Apple addition]
  1617. 6666, // Alternate IRC [Apple addition]
  1618. 6667, // Standard IRC [Apple addition]
  1619. 6668, // Alternate IRC [Apple addition]
  1620. 6669, // Alternate IRC [Apple addition]
  1621. invalidPortNumber, // Used to block all invalid port numbers
  1622. };
  1623. const unsigned short* const blockedPortListEnd = blockedPortList + WTF_ARRAY_LENGTH(blockedPortList);
  1624. #ifndef NDEBUG
  1625. // The port list must be sorted for binary_search to work.
  1626. static bool checkedPortList = false;
  1627. if (!checkedPortList) {
  1628. for (const unsigned short* p = blockedPortList; p != blockedPortListEnd - 1; ++p)
  1629. ASSERT(*p < *(p + 1));
  1630. checkedPortList = true;
  1631. }
  1632. #endif
  1633. // If the port is not in the blocked port list, allow it.
  1634. if (!binary_search(blockedPortList, blockedPortListEnd, port))
  1635. return true;
  1636. // Allow ports 21 and 22 for FTP URLs, as Mozilla does.
  1637. if ((port == 21 || port == 22) && url.protocolIs("ftp"))
  1638. return true;
  1639. // Allow any port number in a file URL, since the port number is ignored.
  1640. if (url.protocolIs("file"))
  1641. return true;
  1642. #if PLATFORM(BLACKBERRY)
  1643. if (url.protocolIs("local"))
  1644. return true;
  1645. #endif
  1646. return false;
  1647. }
  1648. String mimeTypeFromDataURL(const String& url)
  1649. {
  1650. ASSERT(protocolIs(url, "data"));
  1651. size_t index = url.find(';');
  1652. if (index == notFound)
  1653. index = url.find(',');
  1654. if (index != notFound) {
  1655. if (index > 5)
  1656. return url.substring(5, index - 5).lower();
  1657. return "text/plain"; // Data URLs with no MIME type are considered text/plain.
  1658. }
  1659. return "";
  1660. }
  1661. String mimeTypeFromURL(const KURL& url)
  1662. {
  1663. String decodedPath = decodeURLEscapeSequences(url.path());
  1664. String extension = decodedPath.substring(decodedPath.reverseFind('.') + 1);
  1665. // We don't use MIMETypeRegistry::getMIMETypeForPath() because it returns "application/octet-stream" upon failure
  1666. return MIMETypeRegistry::getMIMETypeForExtension(extension);
  1667. }
  1668. bool KURL::isSafeToSendToAnotherThread() const
  1669. {
  1670. return m_string.isSafeToSendToAnotherThread();
  1671. }
  1672. String KURL::stringCenterEllipsizedToLength(unsigned length) const
  1673. {
  1674. if (string().length() <= length)
  1675. return string();
  1676. return string().left(length / 2 - 1) + "..." + string().right(length / 2 - 2);
  1677. }
  1678. }