uri.nim 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574
  1. #
  2. #
  3. # Nim's Runtime Library
  4. # (c) Copyright 2015 Dominik Picheta
  5. #
  6. # See the file "copying.txt", included in this
  7. # distribution, for details about the copyright.
  8. #
  9. ## This module implements URI parsing as specified by RFC 3986.
  10. ##
  11. ## A Uniform Resource Identifier (URI) provides a simple and extensible
  12. ## means for identifying a resource. A URI can be further classified
  13. ## as a locator, a name, or both. The term "Uniform Resource Locator"
  14. ## (URL) refers to the subset of URIs.
  15. ##
  16. ## .. warning:: URI parsers in this module do not perform security validation.
  17. ##
  18. ## # Basic usage
  19. ## ## Combine URIs
runnableExamples:
  # `/` appends a path segment regardless of a leading slash on the segment.
  let host = parseUri("https://nim-lang.org")
  assert $host == "https://nim-lang.org"
  assert $(host / "/blog.html") == "https://nim-lang.org/blog.html"
  assert $(host / "blog2.html") == "https://nim-lang.org/blog2.html"
## ## Access URI item
runnableExamples:
  # Individual components are plain string fields of the parsed `Uri`.
  let res = parseUri("sftp://127.0.0.1:4343")
  assert isAbsolute(res)
  assert res.port == "4343"
## ## Data URI Base64
runnableExamples:
  # The payload is Base64-encoded; the charset defaults to utf-8.
  assert getDataUri("Hello World", "text/plain") == "data:text/plain;charset=utf-8;base64,SGVsbG8gV29ybGQ="
  assert getDataUri("Nim", "text/plain") == "data:text/plain;charset=utf-8;base64,Tmlt"
  34. import std/[strutils, parseutils, base64]
  35. import std/private/[since, decode_helpers]
  36. when defined(nimPreviewSlimSystem):
  37. import std/assertions
type
  Url* = distinct string ## Distinct string type for URLs.

  Uri* = object
    ## The components of a URI. All textual components are stored without
    ## their delimiters (`:`, `//`, `@`, `?`, `#`); `$` re-inserts them.
    scheme*, username*, password*: string
    hostname*, port*, path*, query*, anchor*: string
    opaque*: bool ## Set by `parseUri` when the scheme is not followed by
                  ## `//`; `$` then writes only `:` after the scheme.
    isIpv6*: bool ## Set when the authority contains `[`; `$` then wraps
                  ## `hostname` in square brackets.

  UriParseError* = object of ValueError ## Raised via `uriParseError`.
  46. proc uriParseError*(msg: string) {.noreturn.} =
  47. ## Raises a `UriParseError` exception with message `msg`.
  48. raise newException(UriParseError, msg)
  49. func encodeUrl*(s: string, usePlus = true): string =
  50. ## Encodes a URL according to RFC3986.
  51. ##
  52. ## This means that characters in the set
  53. ## `{'a'..'z', 'A'..'Z', '0'..'9', '-', '.', '_', '~'}` are
  54. ## carried over to the result.
  55. ## All other characters are encoded as `%xx` where `xx`
  56. ## denotes its hexadecimal value.
  57. ##
  58. ## As a special rule, when the value of `usePlus` is true,
  59. ## spaces are encoded as `+` instead of `%20`.
  60. ##
  61. ## **See also:**
  62. ## * `decodeUrl func<#decodeUrl,string>`_
  63. runnableExamples:
  64. assert encodeUrl("https://nim-lang.org") == "https%3A%2F%2Fnim-lang.org"
  65. assert encodeUrl("https://nim-lang.org/this is a test") == "https%3A%2F%2Fnim-lang.org%2Fthis+is+a+test"
  66. assert encodeUrl("https://nim-lang.org/this is a test", false) == "https%3A%2F%2Fnim-lang.org%2Fthis%20is%20a%20test"
  67. result = newStringOfCap(s.len + s.len shr 2) # assume 12% non-alnum-chars
  68. let fromSpace = if usePlus: "+" else: "%20"
  69. for c in s:
  70. case c
  71. # https://tools.ietf.org/html/rfc3986#section-2.3
  72. of 'a'..'z', 'A'..'Z', '0'..'9', '-', '.', '_', '~': add(result, c)
  73. of ' ': add(result, fromSpace)
  74. else:
  75. add(result, '%')
  76. add(result, toHex(ord(c), 2))
  77. func decodeUrl*(s: string, decodePlus = true): string =
  78. ## Decodes a URL according to RFC3986.
  79. ##
  80. ## This means that any `%xx` (where `xx` denotes a hexadecimal
  81. ## value) are converted to the character with ordinal number `xx`,
  82. ## and every other character is carried over.
  83. ## If `xx` is not a valid hexadecimal value, it is left intact.
  84. ##
  85. ## As a special rule, when the value of `decodePlus` is true, `+`
  86. ## characters are converted to a space.
  87. ##
  88. ## **See also:**
  89. ## * `encodeUrl func<#encodeUrl,string>`_
  90. runnableExamples:
  91. assert decodeUrl("https%3A%2F%2Fnim-lang.org") == "https://nim-lang.org"
  92. assert decodeUrl("https%3A%2F%2Fnim-lang.org%2Fthis+is+a+test") == "https://nim-lang.org/this is a test"
  93. assert decodeUrl("https%3A%2F%2Fnim-lang.org%2Fthis%20is%20a%20test",
  94. false) == "https://nim-lang.org/this is a test"
  95. assert decodeUrl("abc%xyz") == "abc%xyz"
  96. result = newString(s.len)
  97. var i = 0
  98. var j = 0
  99. while i < s.len:
  100. case s[i]
  101. of '%':
  102. result[j] = decodePercent(s, i)
  103. of '+':
  104. if decodePlus:
  105. result[j] = ' '
  106. else:
  107. result[j] = s[i]
  108. else: result[j] = s[i]
  109. inc(i)
  110. inc(j)
  111. setLen(result, j)
  112. func encodeQuery*(query: openArray[(string, string)], usePlus = true,
  113. omitEq = true, sep = '&'): string =
  114. ## Encodes a set of (key, value) parameters into a URL query string.
  115. ##
  116. ## Every (key, value) pair is URL-encoded and written as `key=value`. If the
  117. ## value is an empty string then the `=` is omitted, unless `omitEq` is
  118. ## false.
  119. ## The pairs are joined together by the `sep` character.
  120. ##
  121. ## The `usePlus` parameter is passed down to the `encodeUrl` function that
  122. ## is used for the URL encoding of the string values.
  123. ##
  124. ## **See also:**
  125. ## * `encodeUrl func<#encodeUrl,string>`_
  126. runnableExamples:
  127. assert encodeQuery({: }) == ""
  128. assert encodeQuery({"a": "1", "b": "2"}) == "a=1&b=2"
  129. assert encodeQuery({"a": "1", "b": ""}) == "a=1&b"
  130. assert encodeQuery({"a": "1", "b": ""}, omitEq = false, sep = ';') == "a=1;b="
  131. result = ""
  132. for elem in query:
  133. # Encode the `key = value` pairs and separate them with 'sep'
  134. if result.len > 0: result.add(sep)
  135. let (key, val) = elem
  136. result.add(encodeUrl(key, usePlus))
  137. # Omit the '=' if the value string is empty
  138. if not omitEq or val.len > 0:
  139. result.add('=')
  140. result.add(encodeUrl(val, usePlus))
  141. iterator decodeQuery*(data: string, sep = '&'): tuple[key, value: string] =
  142. ## Reads and decodes the query string `data` and yields the `(key, value)` pairs
  143. ## the data consists of. If compiled with `-d:nimLegacyParseQueryStrict`,
  144. ## a `UriParseError` is raised when there is an unencoded `=` character in a decoded
  145. ## value, which was the behavior in Nim < 1.5.1.
  146. runnableExamples:
  147. import std/sequtils
  148. assert toSeq(decodeQuery("foo=1&bar=2=3")) == @[("foo", "1"), ("bar", "2=3")]
  149. assert toSeq(decodeQuery("foo=1;bar=2=3", ';')) == @[("foo", "1"), ("bar", "2=3")]
  150. assert toSeq(decodeQuery("&a&=b&=&&")) == @[("", ""), ("a", ""), ("", "b"), ("", ""), ("", "")]
  151. proc parseData(data: string, i: int, field: var string, sep: char): int =
  152. result = i
  153. while result < data.len:
  154. let c = data[result]
  155. case c
  156. of '%': add(field, decodePercent(data, result))
  157. of '+': add(field, ' ')
  158. of '&': break
  159. else:
  160. if c == sep: break
  161. else: add(field, data[result])
  162. inc(result)
  163. var i = 0
  164. var name = ""
  165. var value = ""
  166. # decode everything in one pass:
  167. while i < data.len:
  168. setLen(name, 0) # reuse memory
  169. i = parseData(data, i, name, '=')
  170. setLen(value, 0) # reuse memory
  171. if i < data.len and data[i] == '=':
  172. inc(i) # skip '='
  173. when defined(nimLegacyParseQueryStrict):
  174. i = parseData(data, i, value, '=')
  175. else:
  176. i = parseData(data, i, value, sep)
  177. yield (name, value)
  178. if i < data.len:
  179. when defined(nimLegacyParseQueryStrict):
  180. if data[i] != '&':
  181. uriParseError("'&' expected at index '$#' for '$#'" % [$i, data])
  182. inc(i)
  183. func parseAuthority(authority: string, result: var Uri) =
  184. var i = 0
  185. var inPort = false
  186. var inIPv6 = false
  187. while i < authority.len:
  188. case authority[i]
  189. of '@':
  190. swap result.password, result.port
  191. result.port.setLen(0)
  192. swap result.username, result.hostname
  193. result.hostname.setLen(0)
  194. inPort = false
  195. of ':':
  196. if inIPv6:
  197. result.hostname.add(authority[i])
  198. else:
  199. inPort = true
  200. of '[':
  201. inIPv6 = true
  202. result.isIpv6 = true
  203. of ']':
  204. inIPv6 = false
  205. else:
  206. if inPort:
  207. result.port.add(authority[i])
  208. else:
  209. result.hostname.add(authority[i])
  210. i.inc
  211. func parsePath(uri: string, i: var int, result: var Uri) =
  212. i.inc parseUntil(uri, result.path, {'?', '#'}, i)
  213. # The 'mailto' scheme's PATH actually contains the hostname/username
  214. if cmpIgnoreCase(result.scheme, "mailto") == 0:
  215. parseAuthority(result.path, result)
  216. result.path.setLen(0)
  217. if i < uri.len and uri[i] == '?':
  218. i.inc # Skip '?'
  219. i.inc parseUntil(uri, result.query, {'#'}, i)
  220. if i < uri.len and uri[i] == '#':
  221. i.inc # Skip '#'
  222. i.inc parseUntil(uri, result.anchor, {}, i)
  223. func initUri*(isIpv6 = false): Uri =
  224. ## Initializes a URI with `scheme`, `username`, `password`,
  225. ## `hostname`, `port`, `path`, `query`, `anchor` and `isIpv6`.
  226. ##
  227. ## **See also:**
  228. ## * `Uri type <#Uri>`_ for available fields in the URI type
  229. runnableExamples:
  230. var uri2 = initUri(isIpv6 = true)
  231. uri2.scheme = "tcp"
  232. uri2.hostname = "2001:0db8:85a3:0000:0000:8a2e:0370:7334"
  233. uri2.port = "8080"
  234. assert $uri2 == "tcp://[2001:0db8:85a3:0000:0000:8a2e:0370:7334]:8080"
  235. result = Uri(scheme: "", username: "", password: "", hostname: "", port: "",
  236. path: "", query: "", anchor: "", isIpv6: isIpv6)
  237. func resetUri(uri: var Uri) =
  238. for f in uri.fields:
  239. when f is string:
  240. f.setLen(0)
  241. else:
  242. f = false
  243. func parseUri*(uri: string, result: var Uri) =
  244. ## Parses a URI. The `result` variable will be cleared before.
  245. ##
  246. ## **See also:**
  247. ## * `Uri type <#Uri>`_ for available fields in the URI type
  248. ## * `initUri func <#initUri>`_ for initializing a URI
  249. runnableExamples:
  250. var res = initUri()
  251. parseUri("https://nim-lang.org/docs/manual.html", res)
  252. assert res.scheme == "https"
  253. assert res.hostname == "nim-lang.org"
  254. assert res.path == "/docs/manual.html"
  255. resetUri(result)
  256. var i = 0
  257. # Check if this is a reference URI (relative URI)
  258. let doubleSlash = uri.len > 1 and uri[0] == '/' and uri[1] == '/'
  259. if i < uri.len and uri[i] == '/':
  260. # Make sure `uri` doesn't begin with '//'.
  261. if not doubleSlash:
  262. parsePath(uri, i, result)
  263. return
  264. # Scheme
  265. i.inc parseWhile(uri, result.scheme, Letters + Digits + {'+', '-', '.'}, i)
  266. if (i >= uri.len or uri[i] != ':') and not doubleSlash:
  267. # Assume this is a reference URI (relative URI)
  268. i = 0
  269. result.scheme.setLen(0)
  270. parsePath(uri, i, result)
  271. return
  272. if not doubleSlash:
  273. i.inc # Skip ':'
  274. # Authority
  275. if i+1 < uri.len and uri[i] == '/' and uri[i+1] == '/':
  276. i.inc(2) # Skip //
  277. var authority = ""
  278. i.inc parseUntil(uri, authority, {'/', '?', '#'}, i)
  279. if authority.len > 0:
  280. parseAuthority(authority, result)
  281. else:
  282. result.opaque = true
  283. # Path
  284. parsePath(uri, i, result)
  285. func parseUri*(uri: string): Uri =
  286. ## Parses a URI and returns it.
  287. ##
  288. ## **See also:**
  289. ## * `Uri type <#Uri>`_ for available fields in the URI type
  290. runnableExamples:
  291. let res = parseUri("ftp://Username:Password@Hostname")
  292. assert res.username == "Username"
  293. assert res.password == "Password"
  294. assert res.scheme == "ftp"
  295. result = initUri()
  296. parseUri(uri, result)
  297. func removeDotSegments(path: string): string =
  298. ## Collapses `..` and `.` in `path` in a similar way as done in `os.normalizedPath`
  299. ## Caution: this is buggy.
  300. runnableExamples:
  301. assert removeDotSegments("a1/a2/../a3/a4/a5/./a6/a7/.//./") == "a1/a3/a4/a5/a6/a7/"
  302. assert removeDotSegments("http://www.ai.") == "http://www.ai."
  303. # xxx adapt or reuse `pathnorm.normalizePath(path, '/')` to make this more reliable, but
  304. # taking into account url specificities such as not collapsing leading `//` in scheme
  305. # `https://`. see `turi` for failing tests.
  306. if path.len == 0: return ""
  307. var collection: seq[string] = @[]
  308. let endsWithSlash = path.endsWith '/'
  309. var i = 0
  310. var currentSegment = ""
  311. while i < path.len:
  312. case path[i]
  313. of '/':
  314. collection.add(currentSegment)
  315. currentSegment = ""
  316. of '.':
  317. if i+2 < path.len and path[i+1] == '.' and path[i+2] == '/':
  318. if collection.len > 0:
  319. discard collection.pop()
  320. i.inc 3
  321. continue
  322. elif i + 1 < path.len and path[i+1] == '/':
  323. i.inc 2
  324. continue
  325. currentSegment.add path[i]
  326. else:
  327. currentSegment.add path[i]
  328. i.inc
  329. if currentSegment != "":
  330. collection.add currentSegment
  331. result = collection.join("/")
  332. if endsWithSlash: result.add '/'
  333. func merge(base, reference: Uri): string =
  334. # http://tools.ietf.org/html/rfc3986#section-5.2.3
  335. if base.hostname != "" and base.path == "":
  336. '/' & reference.path
  337. else:
  338. let lastSegment = rfind(base.path, "/")
  339. if lastSegment == -1:
  340. reference.path
  341. else:
  342. base.path[0 .. lastSegment] & reference.path
  343. func combine*(base: Uri, reference: Uri): Uri =
  344. ## Combines a base URI with a reference URI.
  345. ##
  346. ## This uses the algorithm specified in
  347. ## `section 5.2.2 of RFC 3986 <https://tools.ietf.org/html/rfc3986#section-5.2.2>`_.
  348. ##
  349. ## This means that the slashes inside the base URIs path as well as reference
  350. ## URIs path affect the resulting URI.
  351. ##
  352. ## **See also:**
  353. ## * `/ func <#/,Uri,string>`_ for building URIs
  354. runnableExamples:
  355. let foo = combine(parseUri("https://nim-lang.org/foo/bar"), parseUri("/baz"))
  356. assert foo.path == "/baz"
  357. let bar = combine(parseUri("https://nim-lang.org/foo/bar"), parseUri("baz"))
  358. assert bar.path == "/foo/baz"
  359. let qux = combine(parseUri("https://nim-lang.org/foo/bar/"), parseUri("baz"))
  360. assert qux.path == "/foo/bar/baz"
  361. template setAuthority(dest, src): untyped =
  362. dest.hostname = src.hostname
  363. dest.username = src.username
  364. dest.port = src.port
  365. dest.password = src.password
  366. result = initUri()
  367. if reference.scheme != base.scheme and reference.scheme != "":
  368. result = reference
  369. result.path = removeDotSegments(result.path)
  370. else:
  371. if reference.hostname != "":
  372. setAuthority(result, reference)
  373. result.path = removeDotSegments(reference.path)
  374. result.query = reference.query
  375. else:
  376. if reference.path == "":
  377. result.path = base.path
  378. if reference.query != "":
  379. result.query = reference.query
  380. else:
  381. result.query = base.query
  382. else:
  383. if reference.path.startsWith("/"):
  384. result.path = removeDotSegments(reference.path)
  385. else:
  386. result.path = removeDotSegments(merge(base, reference))
  387. result.query = reference.query
  388. setAuthority(result, base)
  389. result.scheme = base.scheme
  390. result.anchor = reference.anchor
  391. func combine*(uris: varargs[Uri]): Uri =
  392. ## Combines multiple URIs together.
  393. ##
  394. ## **See also:**
  395. ## * `/ func <#/,Uri,string>`_ for building URIs
  396. runnableExamples:
  397. let foo = combine(parseUri("https://nim-lang.org/"), parseUri("docs/"),
  398. parseUri("manual.html"))
  399. assert foo.hostname == "nim-lang.org"
  400. assert foo.path == "/docs/manual.html"
  401. result = uris[0]
  402. for i in 1 ..< uris.len:
  403. result = combine(result, uris[i])
  404. func isAbsolute*(uri: Uri): bool =
  405. ## Returns true if URI is absolute, false otherwise.
  406. runnableExamples:
  407. assert parseUri("https://nim-lang.org").isAbsolute
  408. assert not parseUri("nim-lang").isAbsolute
  409. return uri.scheme != "" and (uri.hostname != "" or uri.path != "")
  410. func `/`*(x: Uri, path: string): Uri =
  411. ## Concatenates the path specified to the specified URIs path.
  412. ##
  413. ## Contrary to the `combine func <#combine,Uri,Uri>`_ you do not have to worry about
  414. ## the slashes at the beginning and end of the path and URIs path
  415. ## respectively.
  416. ##
  417. ## **See also:**
  418. ## * `combine func <#combine,Uri,Uri>`_
  419. runnableExamples:
  420. let foo = parseUri("https://nim-lang.org/foo/bar") / "/baz"
  421. assert foo.path == "/foo/bar/baz"
  422. let bar = parseUri("https://nim-lang.org/foo/bar") / "baz"
  423. assert bar.path == "/foo/bar/baz"
  424. let qux = parseUri("https://nim-lang.org/foo/bar/") / "baz"
  425. assert qux.path == "/foo/bar/baz"
  426. result = x
  427. if result.path.len == 0:
  428. if path.len == 0 or path[0] != '/':
  429. result.path = "/"
  430. result.path.add(path)
  431. return
  432. if result.path.len > 0 and result.path[result.path.len-1] == '/':
  433. if path.len > 0 and path[0] == '/':
  434. result.path.add(path[1 .. path.len-1])
  435. else:
  436. result.path.add(path)
  437. else:
  438. if path.len == 0 or path[0] != '/':
  439. result.path.add '/'
  440. result.path.add(path)
  441. func `?`*(u: Uri, query: openArray[(string, string)]): Uri =
  442. ## Concatenates the query parameters to the specified URI object.
  443. runnableExamples:
  444. let foo = parseUri("https://example.com") / "foo" ? {"bar": "qux"}
  445. assert $foo == "https://example.com/foo?bar=qux"
  446. result = u
  447. result.query = encodeQuery(query)
  448. func `$`*(u: Uri): string =
  449. ## Returns the string representation of the specified URI object.
  450. runnableExamples:
  451. assert $parseUri("https://nim-lang.org") == "https://nim-lang.org"
  452. # Get the len of all the parts.
  453. let schemeLen = u.scheme.len
  454. let usernameLen = u.username.len
  455. let passwordLen = u.password.len
  456. let hostnameLen = u.hostname.len
  457. let portLen = u.port.len
  458. let pathLen = u.path.len
  459. let queryLen = u.query.len
  460. let anchorLen = u.anchor.len
  461. # Prepare a string that fits all the parts and all punctuation chars.
  462. # 12 is the max len required by all possible punctuation chars.
  463. result = newStringOfCap(
  464. schemeLen + usernameLen + passwordLen + hostnameLen + portLen + pathLen + queryLen + anchorLen + 12
  465. )
  466. # Insert to result.
  467. if schemeLen > 0:
  468. result.add u.scheme
  469. result.add ':'
  470. if not u.opaque:
  471. result.add '/'
  472. result.add '/'
  473. if usernameLen > 0:
  474. result.add u.username
  475. if passwordLen > 0:
  476. result.add ':'
  477. result.add u.password
  478. result.add '@'
  479. if u.hostname.endsWith('/'):
  480. if u.isIpv6:
  481. result.add '['
  482. result.add u.hostname[0 .. ^2]
  483. result.add ']'
  484. else:
  485. result.add u.hostname[0 .. ^2]
  486. else:
  487. if u.isIpv6:
  488. result.add '['
  489. result.add u.hostname
  490. result.add ']'
  491. else:
  492. result.add u.hostname
  493. if portLen > 0:
  494. result.add ':'
  495. result.add u.port
  496. if pathLen > 0:
  497. if hostnameLen > 0 and u.path[0] != '/':
  498. result.add '/'
  499. result.add u.path
  500. if queryLen > 0:
  501. result.add '?'
  502. result.add u.query
  503. if anchorLen > 0:
  504. result.add '#'
  505. result.add u.anchor
  506. proc getDataUri*(data, mime: string, encoding = "utf-8"): string {.since: (1, 3).} =
  507. ## Convenience proc for `base64.encode` returns a standard Base64 Data URI (RFC-2397)
  508. ##
  509. ## **See also:**
  510. ## * `mimetypes <mimetypes.html>`_ for `mime` argument
  511. ## * https://tools.ietf.org/html/rfc2397
  512. ## * https://en.wikipedia.org/wiki/Data_URI_scheme
  513. runnableExamples: static: assert getDataUri("Nim", "text/plain") == "data:text/plain;charset=utf-8;base64,Tmlt"
  514. assert encoding.len > 0 and mime.len > 0 # Must *not* be URL-Safe, see RFC-2397
  515. let base64encoded: string = base64.encode(data)
  516. # ("data:".len + ";charset=".len + ";base64,".len) == 22
  517. result = newStringOfCap(22 + mime.len + encoding.len + base64encoded.len)
  518. result.add "data:"
  519. result.add mime
  520. result.add ";charset="
  521. result.add encoding
  522. result.add ";base64,"
  523. result.add base64encoded