srfi-14.scm 4.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155
  1. ;;; SRFI-14: Character sets
  2. ;;; Copyright (C) 2024 David Thompson <dave@spritely.institute>
  3. ;;;
  4. ;;; Licensed under the Apache License, Version 2.0 (the "License");
  5. ;;; you may not use this file except in compliance with the License.
  6. ;;; You may obtain a copy of the License at
  7. ;;;
  8. ;;; http://www.apache.org/licenses/LICENSE-2.0
  9. ;;;
  10. ;;; Unless required by applicable law or agreed to in writing, software
  11. ;;; distributed under the License is distributed on an "AS IS" BASIS,
  12. ;;; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. ;;; See the License for the specific language governing permissions and
  14. ;;; limitations under the License.
  15. ;;; Commentary:
  16. ;;;
  17. ;;; Character sets.
  18. ;;;
  19. ;;; Code:
  20. (define-module (srfi srfi-14)
  21. #:pure
  22. #:use-module ((hoot errors) #:select (check-type))
  23. #:use-module (hoot match)
  24. #:use-module ((hoot lists) #:select (fold sort))
  25. #:use-module ((hoot numbers) #:select (1+))
  26. #:use-module ((hoot syntax) #:select (case-lambda define*))
  27. #:use-module ((hoot vectors) #:select (vector-binary-search))
  28. #:use-module (scheme base)
  29. #:export (char-set
  30. char-set?
  31. char-set-contains?
  32. char-set-union
  33. char-set->list
  34. char-set->string
  35. list->char-set
  36. string->char-set
  37. char-set:lower-case
  38. char-set:upper-case
  39. char-set:title-case
  40. char-set:letter
  41. char-set:digit
  42. char-set:hex-digit
  43. char-set:letter+digit
  44. char-set:graphic
  45. char-set:printing
  46. char-set:whitespace
  47. char-set:iso-control
  48. char-set:punctuation
  49. char-set:symbol
  50. char-set:blank
  51. char-set:ascii
  52. char-set:empty
  53. char-set:full))
  54. ;; FIXME: This is a very poor and incomplete implementation of
  55. ;; character sets. This was written to support the bare minimum
  56. ;; needed to get Guile's (web uri) module to compile.
  57. ;;
  58. ;; What we really need is a port of Guile's srfi-14.c that uses
  59. ;; character ranges.
  60. (define-record-type <char-set>
  61. (make-char-set chars)
  62. char-set?
  63. (chars char-set-chars))
  64. (define empty-char-set (make-char-set #()))
  65. (define* (list->char-set chars #:optional base-cs)
  66. (let ((chars (if base-cs
  67. (append chars (char-set-chars base-cs))
  68. chars)))
  69. (make-char-set
  70. (list->vector
  71. (let lp ((chars (sort chars char<?)) (last #f))
  72. (match chars
  73. (() '())
  74. ((char . rest)
  75. (if (eqv? char last)
  76. (lp rest last)
  77. (cons char (lp rest char))))))))))
  78. (define (string->char-set str)
  79. (list->char-set (string->list str)))
  80. (define (range->char-set start end)
  81. (list->char-set
  82. (let lp ((i start))
  83. (if (< i end)
  84. (cons (integer->char i) (lp (1+ i)))
  85. '()))))
  86. (define (char-set->list cs)
  87. (vector->list (char-set-chars cs)))
  88. (define (char-set->string cs)
  89. (list->string (char-set->list cs)))
  90. (define (char-set . chars)
  91. (for-each (lambda (char)
  92. (check-type char char? 'char-set))
  93. chars)
  94. (list->char-set chars))
  95. (define char-set-union
  96. (case-lambda
  97. (() empty-char-set)
  98. ((char-set) char-set)
  99. (char-sets
  100. (list->char-set
  101. (fold (lambda (char-set chars)
  102. (append (vector->list (char-set-chars char-set)) chars))
  103. '() char-sets)))))
  104. (define (char-compare a b)
  105. (- (char->integer a) (char->integer b)))
  106. (define (char-set-contains? char-set char)
  107. (number? (vector-binary-search (char-set-chars char-set) char char-compare)))
  108. ;;;
  109. ;;; Built-in character sets
  110. ;;;
  111. ;; FIXME: ASCII ranges only for the moment.
  112. (define char-set:empty (char-set))
  113. (define char-set:lower-case
  114. (range->char-set (char->integer #\a) (1+ (char->integer #\z))))
  115. (define char-set:upper-case
  116. (range->char-set (char->integer #\A) (1+ (char->integer #\Z))))
  117. (define char-set:title-case char-set:empty)
  118. (define char-set:letter
  119. (char-set-union char-set:lower-case char-set:upper-case))
  120. (define char-set:digit (string->char-set "0123456789"))
  121. (define char-set:hex-digit (string->char-set "0123456789abcdefABCDEF"))
  122. (define char-set:letter+digit
  123. (char-set-union char-set:letter char-set:digit))
  124. (define char-set:punctuation (string->char-set "!\"#%&'()*,-./:;?@[\\]_{}"))
  125. (define char-set:symbol (string->char-set "$+<=>^`|~"))
  126. (define char-set:graphic
  127. (char-set-union char-set:letter+digit char-set:punctuation char-set:symbol))
  128. (define char-set:whitespace
  129. (list->char-set
  130. '(#\tab #\newline #\vtab #\page #\return #\space #\240)))
  131. (define char-set:printing
  132. (char-set-union char-set:whitespace char-set:graphic))
  133. (define char-set:iso-control
  134. (char-set-union (range->char-set 0 33) (char-set #\delete)))
  135. (define char-set:blank
  136. (list->char-set '(#\tab #\space #\240)))
  137. (define char-set:ascii (range->char-set 0 128))
  138. (define char-set:full char-set:ascii)