uri.scm 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546
  1. ;;;; (web uri) --- URI manipulation tools
  2. ;;;;
  3. ;;;; Copyright (C) 1997,2001,2002,2010,2011,2012,2013,2014,2019 Free Software Foundation, Inc.
  4. ;;;;
  5. ;;;; This library is free software; you can redistribute it and/or
  6. ;;;; modify it under the terms of the GNU Lesser General Public
  7. ;;;; License as published by the Free Software Foundation; either
  8. ;;;; version 3 of the License, or (at your option) any later version.
  9. ;;;;
  10. ;;;; This library is distributed in the hope that it will be useful,
  11. ;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. ;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  13. ;;;; Lesser General Public License for more details.
  14. ;;;;
  15. ;;;; You should have received a copy of the GNU Lesser General Public
  16. ;;;; License along with this library; if not, write to the Free Software
  17. ;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  18. ;;;;
  19. ;;; Commentary:
  20. ;; A data type for Universal Resource Identifiers, as defined in RFC
  21. ;; 3986.
  22. ;;; Code:
  23. (define-module (web uri)
  24. #:use-module (srfi srfi-9)
  25. #:use-module (ice-9 regex)
  26. #:use-module (ice-9 rdelim)
  27. #:use-module (ice-9 control)
  28. #:use-module (rnrs bytevectors)
  29. #:use-module (ice-9 binary-ports)
  30. #:export (uri?
  31. uri-scheme uri-userinfo uri-host uri-port
  32. uri-path uri-query uri-fragment
  33. build-uri
  34. build-uri-reference
  35. declare-default-port!
  36. string->uri string->uri-reference
  37. uri->string
  38. uri-decode uri-encode
  39. split-and-decode-uri-path
  40. encode-and-join-uri-path
  41. uri-reference? relative-ref?
  42. build-uri-reference build-relative-ref
  43. string->uri-reference string->relative-ref))
  44. (define-record-type <uri>
  45. (make-uri scheme userinfo host port path query fragment)
  46. uri-reference?
  47. (scheme uri-scheme)
  48. (userinfo uri-userinfo)
  49. (host uri-host)
  50. (port uri-port)
  51. (path uri-path)
  52. (query uri-query)
  53. (fragment uri-fragment))
  54. ;;;
  55. ;;; Predicates.
  56. ;;;
  57. ;;; These are quick, and assume rigid validation at construction time.
  58. ;;; RFC 3986, #3.
  59. ;;;
  60. ;;; URI = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
  61. ;;;
  62. ;;; hier-part = "//" authority path-abempty
  63. ;;; / path-absolute
  64. ;;; / path-rootless
  65. ;;; / path-empty
  66. (define (uri? obj)
  67. (and (uri-reference? obj)
  68. (uri-scheme obj)
  69. #t))
  70. ;;; RFC 3986, #4.2.
  71. ;;;
  72. ;;; relative-ref = relative-part [ "?" query ] [ "#" fragment ]
  73. ;;;
  74. ;;; relative-part = "//" authority path-abempty
  75. ;;; / path-absolute
  76. ;;; / path-noscheme
  77. ;;; / path-empty
  78. (define (relative-ref? obj)
  79. (and (uri-reference? obj)
  80. (not (uri-scheme obj))))
  81. ;;;
  82. ;;; Constructors.
  83. ;;;
  84. (define (uri-error message . args)
  85. (throw 'uri-error message args))
  86. (define (positive-exact-integer? port)
  87. (and (number? port) (exact? port) (integer? port) (positive? port)))
  88. (define (validate-uri-reference scheme userinfo host port path query fragment)
  89. (cond
  90. ((and scheme (not (symbol? scheme)))
  91. (uri-error "Expected a symbol for the URI scheme: ~s" scheme))
  92. ((and (or userinfo port) (not host))
  93. (uri-error "Expected a host, given userinfo or port"))
  94. ((and port (not (positive-exact-integer? port)))
  95. (uri-error "Expected port to be an integer: ~s" port))
  96. ((and host (or (not (string? host)) (not (valid-host? host))))
  97. (uri-error "Expected valid host: ~s" host))
  98. ((and userinfo (not (string? userinfo)))
  99. (uri-error "Expected string for userinfo: ~s" userinfo))
  100. ((not (string? path))
  101. (uri-error "Expected string for path: ~s" path))
  102. ((and query (not (string? query)))
  103. (uri-error "Expected string for query: ~s" query))
  104. ((and fragment (not (string? fragment)))
  105. (uri-error "Expected string for fragment: ~s" fragment))
  106. ;; Strict validation of allowed paths, based on other components.
  107. ;; Refer to RFC 3986 for the details.
  108. ((not (string-null? path))
  109. (if host
  110. (cond
  111. ((not (eqv? (string-ref path 0) #\/))
  112. (uri-error
  113. "Expected absolute path starting with \"/\": ~a" path)))
  114. (cond
  115. ((string-prefix? "//" path)
  116. (uri-error
  117. "Expected path not starting with \"//\" (no host): ~a" path))
  118. ((and (not scheme)
  119. (not (eqv? (string-ref path 0) #\/))
  120. (let ((colon (string-index path #\:)))
  121. (and colon (not (string-index path #\/ 0 colon)))))
  122. (uri-error
  123. "Expected relative path's first segment without \":\": ~a"
  124. path)))))))
  125. (define* (build-uri scheme #:key userinfo host port (path "") query fragment
  126. (validate? #t))
  127. "Construct a URI object. SCHEME should be a symbol, PORT either a
  128. positive, exact integer or ‘#f’, and the rest of the fields are either
  129. strings or ‘#f’. If VALIDATE? is true, also run some consistency checks
  130. to make sure that the constructed object is a valid URI."
  131. (when validate?
  132. (unless scheme (uri-error "Missing URI scheme"))
  133. (validate-uri-reference scheme userinfo host port path query fragment))
  134. (make-uri scheme userinfo host port path query fragment))
  135. (define* (build-uri-reference #:key scheme userinfo host port (path "") query
  136. fragment (validate? #t))
  137. "Construct a URI-reference object. SCHEME should be a symbol or ‘#f’,
  138. PORT either a positive, exact integer or ‘#f’, and the rest of the
  139. fields are either strings or ‘#f’. If VALIDATE? is true, also run some
  140. consistency checks to make sure that the constructed URI is a valid URI
  141. reference."
  142. (when validate?
  143. (validate-uri-reference scheme userinfo host port path query fragment))
  144. (make-uri scheme userinfo host port path query fragment))
  145. (define* (build-relative-ref #:key userinfo host port (path "") query fragment
  146. (validate? #t))
  147. "Construct a relative-ref URI object. The arguments are the same as
  148. for ‘build-uri’ except there is no scheme."
  149. (when validate?
  150. (validate-uri-reference #f userinfo host port path query fragment))
  151. (make-uri #f userinfo host port path query fragment))
  152. ;;;
  153. ;;; Converters.
  154. ;;;
  155. ;; Since character ranges in regular expressions may depend on the
  156. ;; current locale, we use explicit lists of characters instead. See
  157. ;; <https://bugs.gnu.org/35785> for details.
  158. (define digits "0123456789")
  159. (define hex-digits "0123456789ABCDEFabcdef")
  160. (define letters "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz")
  161. ;; See RFC 3986 #3.2.2 for comments on percent-encodings, IDNA (RFC
  162. ;; 3490), and non-ASCII host names.
  163. ;;
  164. (define ipv4-regexp
  165. (make-regexp (string-append "^([" digits ".]+)$")))
  166. (define ipv6-regexp
  167. (make-regexp (string-append "^([" hex-digits ":.]+)$")))
  168. (define domain-label-regexp
  169. (make-regexp
  170. (string-append "^[" letters digits "]"
  171. "([" letters digits "-]*[" letters digits "])?$")))
  172. (define top-label-regexp
  173. (make-regexp
  174. (string-append "^[" letters "]"
  175. "([" letters digits "-]*[" letters digits "])?$")))
  176. (define (valid-host? host)
  177. (cond
  178. ((regexp-exec ipv4-regexp host)
  179. (false-if-exception (inet-pton AF_INET host)))
  180. ((regexp-exec ipv6-regexp host)
  181. (false-if-exception (inet-pton AF_INET6 host)))
  182. (else
  183. (let lp ((start 0))
  184. (let ((end (string-index host #\. start)))
  185. (if end
  186. (and (regexp-exec domain-label-regexp
  187. (substring host start end))
  188. (lp (1+ end)))
  189. (regexp-exec top-label-regexp host start)))))))
  190. (define userinfo-pat
  191. (string-append "[" letters digits "_.!~*'();:&=+$,-]+"))
  192. (define host-pat
  193. (string-append "[" letters digits ".-]+"))
  194. (define ipv6-host-pat
  195. (string-append "[" hex-digits ":.]+"))
  196. (define port-pat
  197. (string-append "[" digits "]*"))
  198. (define authority-regexp
  199. (make-regexp
  200. (format #f "^//((~a)@)?((~a)|(\\[(~a)\\]))(:(~a))?$"
  201. userinfo-pat host-pat ipv6-host-pat port-pat)))
  202. (define (parse-authority authority fail)
  203. (if (equal? authority "//")
  204. ;; Allow empty authorities: file:///etc/hosts is a synonym of
  205. ;; file:/etc/hosts.
  206. (values #f #f #f)
  207. (let ((m (regexp-exec authority-regexp authority)))
  208. (if (and m (valid-host? (or (match:substring m 4)
  209. (match:substring m 6))))
  210. (values (match:substring m 2)
  211. (or (match:substring m 4)
  212. (match:substring m 6))
  213. (let ((port (match:substring m 8)))
  214. (and port (not (string-null? port))
  215. (string->number port))))
  216. (fail)))))
  217. ;;; RFC 3986, #3.
  218. ;;;
  219. ;;; URI = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
  220. ;;;
  221. ;;; hier-part = "//" authority path-abempty
  222. ;;; / path-absolute
  223. ;;; / path-rootless
  224. ;;; / path-empty
  225. ;;;
  226. ;;; A URI-reference is the same as URI, but where the scheme is
  227. ;;; optional. If the scheme is not present, its colon isn't present
  228. ;;; either.
  229. (define scheme-pat
  230. (string-append "[" letters "][" letters digits "+.-]*"))
  231. (define authority-pat
  232. "[^/?#]*")
  233. (define path-pat
  234. "[^?#]*")
  235. (define query-pat
  236. "[^#]*")
  237. (define fragment-pat
  238. ".*")
  239. (define uri-pat
  240. (format #f "^((~a):)?(//~a)?(~a)(\\?(~a))?(#(~a))?$"
  241. scheme-pat authority-pat path-pat query-pat fragment-pat))
  242. (define uri-regexp
  243. (make-regexp uri-pat))
  244. (define (string->uri-reference string)
  245. "Parse STRING into a URI-reference object. Return ‘#f’ if the string
  246. could not be parsed."
  247. (% (let ((m (regexp-exec uri-regexp string)))
  248. (unless m (abort))
  249. (let ((scheme (let ((str (match:substring m 2)))
  250. (and str (string->symbol (string-downcase str)))))
  251. (authority (match:substring m 3))
  252. (path (match:substring m 4))
  253. (query (match:substring m 6))
  254. (fragment (match:substring m 8)))
  255. ;; The regular expression already ensures all of the validation
  256. ;; requirements for URI-references, except the one that the
  257. ;; first component of a relative-ref's path can't contain a
  258. ;; colon.
  259. (unless scheme
  260. (let ((colon (string-index path #\:)))
  261. (when (and colon (not (string-index path #\/ 0 colon)))
  262. (abort))))
  263. (call-with-values
  264. (lambda ()
  265. (if authority
  266. (parse-authority authority abort)
  267. (values #f #f #f)))
  268. (lambda (userinfo host port)
  269. (make-uri scheme userinfo host port path query fragment)))))
  270. (lambda (k)
  271. #f)))
  272. (define (string->uri string)
  273. "Parse STRING into a URI object. Return ‘#f’ if the string could not
  274. be parsed. Note that this procedure will require that the URI have a
  275. scheme."
  276. (let ((uri-reference (string->uri-reference string)))
  277. (and (not (relative-ref? uri-reference))
  278. uri-reference)))
  279. (define (string->relative-ref string)
  280. "Parse STRING into a relative-ref URI object. Return ‘#f’ if the
  281. string could not be parsed."
  282. (let ((uri-reference (string->uri-reference string)))
  283. (and (relative-ref? uri-reference)
  284. uri-reference)))
  285. (define *default-ports* (make-hash-table))
  286. (define (declare-default-port! scheme port)
  287. "Declare a default port for the given URI scheme."
  288. (hashq-set! *default-ports* scheme port))
  289. (define (default-port? scheme port)
  290. (or (not port)
  291. (eqv? port (hashq-ref *default-ports* scheme))))
  292. (declare-default-port! 'http 80)
  293. (declare-default-port! 'https 443)
  294. (define* (uri->string uri #:key (include-fragment? #t))
  295. "Serialize URI to a string. If the URI has a port that is the
  296. default port for its scheme, the port is not included in the
  297. serialization."
  298. (let* ((scheme (uri-scheme uri))
  299. (userinfo (uri-userinfo uri))
  300. (host (uri-host uri))
  301. (port (uri-port uri))
  302. (path (uri-path uri))
  303. (query (uri-query uri))
  304. (fragment (uri-fragment uri)))
  305. (string-append
  306. (if scheme
  307. (string-append (symbol->string scheme) ":")
  308. "")
  309. (if host
  310. (string-append "//"
  311. (if userinfo (string-append userinfo "@")
  312. "")
  313. (if (string-index host #\:)
  314. (string-append "[" host "]")
  315. host)
  316. (if (default-port? (uri-scheme uri) port)
  317. ""
  318. (string-append ":" (number->string port))))
  319. "")
  320. path
  321. (if query
  322. (string-append "?" query)
  323. "")
  324. (if (and fragment include-fragment?)
  325. (string-append "#" fragment)
  326. ""))))
  327. ;; like call-with-output-string, but actually closes the port (doh)
  328. (define (call-with-output-string* proc)
  329. (let ((port (open-output-string)))
  330. (proc port)
  331. (let ((str (get-output-string port)))
  332. (close-port port)
  333. str)))
  334. (define (call-with-output-bytevector* proc)
  335. (call-with-values
  336. (lambda ()
  337. (open-bytevector-output-port))
  338. (lambda (port get-bytevector)
  339. (proc port)
  340. (let ((bv (get-bytevector)))
  341. (close-port port)
  342. bv))))
  343. (define (call-with-encoded-output-string encoding proc)
  344. (if (string-ci=? encoding "utf-8")
  345. (string->utf8 (call-with-output-string* proc))
  346. (call-with-output-bytevector*
  347. (lambda (port)
  348. (set-port-encoding! port encoding)
  349. (proc port)))))
  350. (define (encode-string str encoding)
  351. (if (string-ci=? encoding "utf-8")
  352. (string->utf8 str)
  353. (call-with-encoded-output-string encoding
  354. (lambda (port)
  355. (display str port)))))
  356. (define (decode-string bv encoding)
  357. (if (string-ci=? encoding "utf-8")
  358. (utf8->string bv)
  359. (let ((p (open-bytevector-input-port bv)))
  360. (set-port-encoding! p encoding)
  361. (let ((res (read-string p)))
  362. (close-port p)
  363. res))))
  364. ;; A note on characters and bytes: URIs are defined to be sequences of
  365. ;; characters in a subset of ASCII. Those characters may encode a
  366. ;; sequence of bytes (octets), which in turn may encode sequences of
  367. ;; characters in other character sets.
  368. ;;
  369. ;; Return a new string made from uri-decoding STR. Specifically,
  370. ;; turn ‘+’ into space, and hex-encoded ‘%XX’ strings into
  371. ;; their eight-bit characters.
  372. ;;
  373. (define hex-chars
  374. (string->char-set "0123456789abcdefABCDEF"))
  375. (define* (uri-decode str #:key (encoding "utf-8") (decode-plus-to-space? #t))
  376. "Percent-decode the given STR, according to ENCODING,
  377. which should be the name of a character encoding.
  378. Note that this function should not generally be applied to a full URI
  379. string. For paths, use ‘split-and-decode-uri-path’ instead. For query
  380. strings, split the query on ‘&’ and ‘=’ boundaries, and decode
  381. the components separately.
  382. Note also that percent-encoded strings encode _bytes_, not characters.
  383. There is no guarantee that a given byte sequence is a valid string
  384. encoding. Therefore this routine may signal an error if the decoded
  385. bytes are not valid for the given encoding. Pass ‘#f’ for ENCODING if
  386. you want decoded bytes as a bytevector directly. ‘set-port-encoding!’,
  387. for more information on character encodings.
  388. If DECODE-PLUS-TO-SPACE? is true, which is the default, also replace
  389. instances of the plus character (+) with a space character. This is
  390. needed when parsing application/x-www-form-urlencoded data.
  391. Returns a string of the decoded characters, or a bytevector if
  392. ENCODING was ‘#f’."
  393. (let* ((len (string-length str))
  394. (bv
  395. (call-with-output-bytevector*
  396. (lambda (port)
  397. (let lp ((i 0))
  398. (if (< i len)
  399. (let ((ch (string-ref str i)))
  400. (cond
  401. ((and (eqv? ch #\+) decode-plus-to-space?)
  402. (put-u8 port (char->integer #\space))
  403. (lp (1+ i)))
  404. ((and (< (+ i 2) len) (eqv? ch #\%)
  405. (let ((a (string-ref str (+ i 1)))
  406. (b (string-ref str (+ i 2))))
  407. (and (char-set-contains? hex-chars a)
  408. (char-set-contains? hex-chars b)
  409. (string->number (string a b) 16))))
  410. => (lambda (u8)
  411. (put-u8 port u8)
  412. (lp (+ i 3))))
  413. ((< (char->integer ch) 128)
  414. (put-u8 port (char->integer ch))
  415. (lp (1+ i)))
  416. (else
  417. (uri-error "Invalid character in encoded URI ~a: ~s"
  418. str ch))))))))))
  419. (if encoding
  420. (decode-string bv encoding)
  421. ;; Otherwise return raw bytevector
  422. bv)))
  423. (define ascii-alnum-chars
  424. (string->char-set
  425. "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"))
  426. ;; RFC 3986, #2.2.
  427. (define gen-delims
  428. (string->char-set ":/?#[]@"))
  429. (define sub-delims
  430. (string->char-set "!$&'()*+,l="))
  431. (define reserved-chars
  432. (char-set-union gen-delims sub-delims))
  433. ;; RFC 3986, #2.3
  434. (define unreserved-chars
  435. (char-set-union ascii-alnum-chars
  436. (string->char-set "-._~")))
  437. ;; Return a new string made from uri-encoding STR, unconditionally
  438. ;; transforming any characters not in UNESCAPED-CHARS.
  439. ;;
  440. (define* (uri-encode str #:key (encoding "utf-8")
  441. (unescaped-chars unreserved-chars))
  442. "Percent-encode any character not in the character set,
  443. UNESCAPED-CHARS.
  444. The default character set includes alphanumerics from ASCII, as well as
  445. the special characters ‘-’, ‘.’, ‘_’, and ‘~’. Any other character will
  446. be percent-encoded, by writing out the character to a bytevector within
  447. the given ENCODING, then encoding each byte as ‘%HH’, where HH is the
  448. uppercase hexadecimal representation of the byte."
  449. (define (needs-escaped? ch)
  450. (not (char-set-contains? unescaped-chars ch)))
  451. (if (string-index str needs-escaped?)
  452. (call-with-output-string*
  453. (lambda (port)
  454. (string-for-each
  455. (lambda (ch)
  456. (if (char-set-contains? unescaped-chars ch)
  457. (display ch port)
  458. (let* ((bv (encode-string (string ch) encoding))
  459. (len (bytevector-length bv)))
  460. (let lp ((i 0))
  461. (if (< i len)
  462. (let ((byte (bytevector-u8-ref bv i)))
  463. (display #\% port)
  464. (when (< byte 16)
  465. (display #\0 port))
  466. (display (string-upcase (number->string byte 16))
  467. port)
  468. (lp (1+ i))))))))
  469. str)))
  470. str))
  471. (define (split-and-decode-uri-path path)
  472. "Split PATH into its components, and decode each component,
  473. removing empty components.
  474. For example, ‘\"/foo/bar%20baz/\"’ decodes to the two-element list,
  475. ‘(\"foo\" \"bar baz\")’."
  476. (filter (lambda (x) (not (string-null? x)))
  477. (map (lambda (s) (uri-decode s #:decode-plus-to-space? #f))
  478. (string-split path #\/))))
  479. (define (encode-and-join-uri-path parts)
  480. "URI-encode each element of PARTS, which should be a list of
  481. strings, and join the parts together with ‘/’ as a delimiter.
  482. For example, the list ‘(\"scrambled eggs\" \"biscuits&gravy\")’
  483. encodes as ‘\"scrambled%20eggs/biscuits%26gravy\"’."
  484. (string-join (map uri-encode parts) "/"))