uri.scm 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389
  1. ;;;; (web uri) --- URI manipulation tools
  2. ;;;;
  3. ;;;; Copyright (C) 1997,2001,2002,2010,2011 Free Software Foundation, Inc.
  4. ;;;;
  5. ;;;; This library is free software; you can redistribute it and/or
  6. ;;;; modify it under the terms of the GNU Lesser General Public
  7. ;;;; License as published by the Free Software Foundation; either
  8. ;;;; version 3 of the License, or (at your option) any later version.
  9. ;;;;
  10. ;;;; This library is distributed in the hope that it will be useful,
  11. ;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. ;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  13. ;;;; Lesser General Public License for more details.
  14. ;;;;
  15. ;;;; You should have received a copy of the GNU Lesser General Public
  16. ;;;; License along with this library; if not, write to the Free Software
  17. ;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  18. ;;;;
  19. ;;; Commentary:
  20. ;; A data type for Universal Resource Identifiers, as defined in RFC
  21. ;; 3986.
  22. ;;; Code:
  23. (define-module (web uri)
  24. #:use-module (srfi srfi-9)
  25. #:use-module (ice-9 regex)
  26. #:use-module (ice-9 rdelim)
  27. #:use-module (ice-9 control)
  28. #:use-module (rnrs bytevectors)
  29. #:use-module (ice-9 binary-ports)
  30. #:export (uri?
  31. uri-scheme uri-userinfo uri-host uri-port
  32. uri-path uri-query uri-fragment
  33. build-uri
  34. declare-default-port!
  35. string->uri uri->string
  36. uri-decode uri-encode
  37. split-and-decode-uri-path
  38. encode-and-join-uri-path))
  39. (define-record-type <uri>
  40. (make-uri scheme userinfo host port path query fragment)
  41. uri?
  42. (scheme uri-scheme)
  43. (userinfo uri-userinfo)
  44. (host uri-host)
  45. (port uri-port)
  46. (path uri-path)
  47. (query uri-query)
  48. (fragment uri-fragment))
  49. (define (uri-error message . args)
  50. (throw 'uri-error message args))
  51. (define (positive-exact-integer? port)
  52. (and (number? port) (exact? port) (integer? port) (positive? port)))
  53. (define (validate-uri scheme userinfo host port path query fragment)
  54. (cond
  55. ((not (symbol? scheme))
  56. (uri-error "Expected a symbol for the URI scheme: ~s" scheme))
  57. ((and (or userinfo port) (not host))
  58. (uri-error "Expected a host, given userinfo or port"))
  59. ((and port (not (positive-exact-integer? port)))
  60. (uri-error "Expected port to be an integer: ~s" port))
  61. ((and host (or (not (string? host)) (not (valid-host? host))))
  62. (uri-error "Expected valid host: ~s" host))
  63. ((and userinfo (not (string? userinfo)))
  64. (uri-error "Expected string for userinfo: ~s" userinfo))
  65. ((not (string? path))
  66. (uri-error "Expected string for path: ~s" path))
  67. ((and host (not (string-null? path))
  68. (not (eqv? (string-ref path 0) #\/)))
  69. (uri-error "Expected path of absolute URI to start with a /: ~a" path))))
  70. (define* (build-uri scheme #:key userinfo host port (path "") query fragment
  71. (validate? #t))
  72. "Construct a URI object. If @var{validate?} is true, also run some
  73. consistency checks to make sure that the constructed URI is valid."
  74. (if validate?
  75. (validate-uri scheme userinfo host port path query fragment))
  76. (make-uri scheme userinfo host port path query fragment))
  77. ;; See RFC 3986 #3.2.2 for comments on percent-encodings, IDNA (RFC
  78. ;; 3490), and non-ASCII host names.
  79. ;;
  80. (define ipv4-regexp
  81. (make-regexp "^([0-9.]+)"))
  82. (define ipv6-regexp
  83. (make-regexp "^\\[([0-9a-fA-F:]+)\\]+"))
  84. (define domain-label-regexp
  85. (make-regexp "^[a-zA-Z0-9]([a-zA-Z0-9-]*[a-zA-Z0-9])?$"))
  86. (define top-label-regexp
  87. (make-regexp "^[a-zA-Z]([a-zA-Z0-9-]*[a-zA-Z0-9])?$"))
  88. (define (valid-host? host)
  89. (cond
  90. ((regexp-exec ipv4-regexp host)
  91. => (lambda (m)
  92. (false-if-exception (inet-pton AF_INET (match:substring m 1)))))
  93. ((regexp-exec ipv6-regexp host)
  94. => (lambda (m)
  95. (false-if-exception (inet-pton AF_INET6 (match:substring m 1)))))
  96. (else
  97. (let ((labels (reverse (string-split host #\.))))
  98. (and (pair? labels)
  99. (regexp-exec top-label-regexp (car labels))
  100. (and-map (lambda (label)
  101. (regexp-exec domain-label-regexp label))
  102. (cdr labels)))))))
  103. (define userinfo-pat
  104. "[a-zA-Z0-9_.!~*'();:&=+$,-]+")
  105. (define host-pat
  106. "[a-zA-Z0-9.-]+")
  107. (define port-pat
  108. "[0-9]*")
  109. (define authority-regexp
  110. (make-regexp
  111. (format #f "^//((~a)@)?(~a)(:(~a))?$"
  112. userinfo-pat host-pat port-pat)))
  113. (define (parse-authority authority fail)
  114. (let ((m (regexp-exec authority-regexp authority)))
  115. (if (and m (valid-host? (match:substring m 3)))
  116. (values (match:substring m 2)
  117. (match:substring m 3)
  118. (let ((port (match:substring m 5)))
  119. (and port (not (string-null? port))
  120. (string->number port))))
  121. (fail))))
  122. ;;; RFC 3986, #3.
  123. ;;;
  124. ;;; URI = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
  125. ;;;
  126. ;;; hier-part = "//" authority path-abempty
  127. ;;; / path-absolute
  128. ;;; / path-rootless
  129. ;;; / path-empty
  130. (define scheme-pat
  131. "[a-zA-Z][a-zA-Z0-9+.-]*")
  132. (define authority-pat
  133. "[^/?#]*")
  134. (define path-pat
  135. "[^?#]*")
  136. (define query-pat
  137. "[^#]*")
  138. (define fragment-pat
  139. ".*")
  140. (define uri-pat
  141. (format #f "^(~a):(//~a)?(~a)(\\?(~a))?(#(~a))?$"
  142. scheme-pat authority-pat path-pat query-pat fragment-pat))
  143. (define uri-regexp
  144. (make-regexp uri-pat))
  145. (define (string->uri string)
  146. "Parse @var{string} into a URI object. Returns @code{#f} if the string
  147. could not be parsed."
  148. (% (let ((m (regexp-exec uri-regexp string)))
  149. (if (not m) (abort))
  150. (let ((scheme (string->symbol
  151. (string-downcase (match:substring m 1))))
  152. (authority (match:substring m 2))
  153. (path (match:substring m 3))
  154. (query (match:substring m 5))
  155. (fragment (match:substring m 7)))
  156. (call-with-values
  157. (lambda ()
  158. (if authority
  159. (parse-authority authority abort)
  160. (values #f #f #f)))
  161. (lambda (userinfo host port)
  162. (make-uri scheme userinfo host port path query fragment)))))
  163. (lambda (k)
  164. #f)))
  165. (define *default-ports* (make-hash-table))
  166. (define (declare-default-port! scheme port)
  167. "Declare a default port for the given URI scheme.
  168. Default ports are for printing URI objects: a default port is not
  169. printed."
  170. (hashq-set! *default-ports* scheme port))
  171. (define (default-port? scheme port)
  172. (or (not port)
  173. (eqv? port (hashq-ref *default-ports* scheme))))
  174. (declare-default-port! 'http 80)
  175. (declare-default-port! 'https 443)
  176. (define (uri->string uri)
  177. "Serialize @var{uri} to a string."
  178. (let* ((scheme-str (string-append
  179. (symbol->string (uri-scheme uri)) ":"))
  180. (userinfo (uri-userinfo uri))
  181. (host (uri-host uri))
  182. (port (uri-port uri))
  183. (path (uri-path uri))
  184. (query (uri-query uri))
  185. (fragment (uri-fragment uri)))
  186. (string-append
  187. scheme-str
  188. (if host
  189. (string-append "//"
  190. (if userinfo (string-append userinfo "@")
  191. "")
  192. host
  193. (if (default-port? (uri-scheme uri) port)
  194. ""
  195. (string-append ":" (number->string port))))
  196. "")
  197. path
  198. (if query
  199. (string-append "?" query)
  200. "")
  201. (if fragment
  202. (string-append "#" fragment)
  203. ""))))
  204. ;; like call-with-output-string, but actually closes the port (doh)
  205. (define (call-with-output-string* proc)
  206. (let ((port (open-output-string)))
  207. (proc port)
  208. (let ((str (get-output-string port)))
  209. (close-port port)
  210. str)))
  211. (define (call-with-output-bytevector* proc)
  212. (call-with-values
  213. (lambda ()
  214. (open-bytevector-output-port))
  215. (lambda (port get-bytevector)
  216. (proc port)
  217. (let ((bv (get-bytevector)))
  218. (close-port port)
  219. bv))))
  220. (define (call-with-encoded-output-string encoding proc)
  221. (if (string-ci=? encoding "utf-8")
  222. (string->utf8 (call-with-output-string* proc))
  223. (call-with-output-bytevector*
  224. (lambda (port)
  225. (set-port-encoding! port encoding)
  226. (proc port)))))
  227. (define (encode-string str encoding)
  228. (if (string-ci=? encoding "utf-8")
  229. (string->utf8 str)
  230. (call-with-encoded-output-string encoding
  231. (lambda (port)
  232. (display str port)))))
  233. (define (decode-string bv encoding)
  234. (if (string-ci=? encoding "utf-8")
  235. (utf8->string bv)
  236. (let ((p (open-bytevector-input-port bv)))
  237. (set-port-encoding! p encoding)
  238. (let ((res (read-delimited "" p)))
  239. (close-port p)
  240. res))))
  241. ;; A note on characters and bytes: URIs are defined to be sequences of
  242. ;; characters in a subset of ASCII. Those characters may encode a
  243. ;; sequence of bytes (octets), which in turn may encode sequences of
  244. ;; characters in other character sets.
  245. ;;
  246. ;; Return a new string made from uri-decoding @var{str}. Specifically,
  247. ;; turn @code{+} into space, and hex-encoded @code{%XX} strings into
  248. ;; their eight-bit characters.
  249. ;;
  250. (define hex-chars
  251. (string->char-set "0123456789abcdefABCDEF"))
  252. (define* (uri-decode str #:key (encoding "utf-8"))
  253. "Percent-decode the given @var{str}, according to @var{encoding}.
  254. Note that this function should not generally be applied to a full URI
  255. string. For paths, use split-and-decode-uri-path instead. For query
  256. strings, split the query on @code{&} and @code{=} boundaries, and decode
  257. the components separately.
  258. Note that percent-encoded strings encode @emph{bytes}, not characters.
  259. There is no guarantee that a given byte sequence is a valid string
  260. encoding. Therefore this routine may signal an error if the decoded
  261. bytes are not valid for the given encoding. Pass @code{#f} for
  262. @var{encoding} if you want decoded bytes as a bytevector directly."
  263. (let* ((len (string-length str))
  264. (bv
  265. (call-with-output-bytevector*
  266. (lambda (port)
  267. (let lp ((i 0))
  268. (if (< i len)
  269. (let ((ch (string-ref str i)))
  270. (cond
  271. ((eqv? ch #\+)
  272. (put-u8 port (char->integer #\space))
  273. (lp (1+ i)))
  274. ((and (< (+ i 2) len) (eqv? ch #\%)
  275. (let ((a (string-ref str (+ i 1)))
  276. (b (string-ref str (+ i 2))))
  277. (and (char-set-contains? hex-chars a)
  278. (char-set-contains? hex-chars b)
  279. (string->number (string a b) 16))))
  280. => (lambda (u8)
  281. (put-u8 port u8)
  282. (lp (+ i 3))))
  283. ((< (char->integer ch) 128)
  284. (put-u8 port (char->integer ch))
  285. (lp (1+ i)))
  286. (else
  287. (uri-error "Invalid character in encoded URI ~a: ~s"
  288. str ch))))))))))
  289. (if encoding
  290. (decode-string bv encoding)
  291. ;; Otherwise return raw bytevector
  292. bv)))
  293. (define ascii-alnum-chars
  294. (string->char-set
  295. "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"))
  296. ;; RFC 3986, #2.2.
  297. (define gen-delims
  298. (string->char-set ":/?#[]@"))
  299. (define sub-delims
  300. (string->char-set "!$&'()*+,l="))
  301. (define reserved-chars
  302. (char-set-union gen-delims sub-delims))
  303. ;; RFC 3986, #2.3
  304. (define unreserved-chars
  305. (char-set-union ascii-alnum-chars
  306. (string->char-set "-._~")))
  307. ;; Return a new string made from uri-encoding @var{str}, unconditionally
  308. ;; transforming any characters not in @var{unescaped-chars}.
  309. ;;
  310. (define* (uri-encode str #:key (encoding "utf-8")
  311. (unescaped-chars unreserved-chars))
  312. "Percent-encode any character not in the character set, @var{unescaped-chars}.
  313. Percent-encoding first writes out the given character to a bytevector
  314. within the given @var{encoding}, then encodes each byte as
  315. @code{%@var{HH}}, where @var{HH} is the hexadecimal representation of
  316. the byte."
  317. (if (string-index str unescaped-chars)
  318. (call-with-output-string*
  319. (lambda (port)
  320. (string-for-each
  321. (lambda (ch)
  322. (if (char-set-contains? unescaped-chars ch)
  323. (display ch port)
  324. (let* ((bv (encode-string (string ch) encoding))
  325. (len (bytevector-length bv)))
  326. (let lp ((i 0))
  327. (if (< i len)
  328. (let ((byte (bytevector-u8-ref bv i)))
  329. (display #\% port)
  330. (display (number->string byte 16) port)
  331. (lp (1+ i))))))))
  332. str)))
  333. str))
  334. (define (split-and-decode-uri-path path)
  335. "Split @var{path} into its components, and decode each
  336. component, removing empty components.
  337. For example, @code{\"/foo/bar/\"} decodes to the two-element list,
  338. @code{(\"foo\" \"bar\")}."
  339. (filter (lambda (x) (not (string-null? x)))
  340. (map uri-decode (string-split path #\/))))
  341. (define (encode-and-join-uri-path parts)
  342. "URI-encode each element of @var{parts}, which should be a list of
  343. strings, and join the parts together with @code{/} as a delimiter."
  344. (string-join (map uri-encode parts) "/"))