unicode-normalization.scm 8.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271
  1. ; Part of Scheme 48 1.9. See file COPYING for notices and license.
  2. ; Authors: Mike Sperber
  3. ; returns index of value (must be number) in vector
  4. (define (binary-search vec val)
  5. (let ((size (vector-length vec)))
  6. (let loop ((low 0) ; inclusive
  7. (high size)) ; exclusive
  8. (cond
  9. ((< low (- high 1))
  10. (let* ((pos (quotient (+ low high) 2)) ; always in
  11. (at (vector-ref vec pos)))
  12. (cond
  13. ((= val at) pos)
  14. ((< val at)
  15. (loop low pos))
  16. (else
  17. (loop pos high)))))
  18. ((< low high)
  19. (if (= val (vector-ref vec low))
  20. low
  21. #f))
  22. (else #f)))))
  23. (define *normalization-info-block-mask*
  24. (- (arithmetic-shift 1 *normalization-info-block-bits*) 1))
  25. (define (sv-normalization-info s)
  26. (vector-ref *normalization-info-encodings*
  27. (+ (vector-ref *normalization-info-indices*
  28. (arithmetic-shift s (- *normalization-info-block-bits*)))
  29. (bitwise-and s
  30. *normalization-info-block-mask*))))
  31. (define (sv-combining-class s)
  32. (bitwise-and (sv-normalization-info s) #xff))
  33. (define (sv-has-canonical-decomposition? s)
  34. (not (zero? (bitwise-and (sv-normalization-info s) #x100))))
  35. (define (sv-has-compatibility-decomposition? s)
  36. (not (zero? (bitwise-and (sv-normalization-info s) #x200))))
  37. ;; Hangul constants
  38. ;; from Unicode Standard Annex #15
  39. (define jamo-syllable-start #xAC00)
  40. (define jamo-initial-consonant-start #x1100)
  41. (define jamo-initial-consonant-count 19)
  42. (define jamo-initial-consonant-end (+ jamo-initial-consonant-start jamo-initial-consonant-count))
  43. (define jamo-trailing-consonant-start #x11A7)
  44. (define jamo-trailing-consonant-count 28)
  45. (define jamo-trailing-consonant-end (+ jamo-trailing-consonant-start jamo-trailing-consonant-count))
  46. (define jamo-vowel-start #x1161)
  47. (define jamo-vowel-count 21)
  48. (define jamo-vowel-end (+ jamo-vowel-start jamo-vowel-count))
  49. ;; number of syllables with a given initial consonant
  50. (define jamo-syllable-per-count
  51. (* jamo-vowel-count jamo-trailing-consonant-count))
  52. (define jamo-syllable-count
  53. (* jamo-initial-consonant-count jamo-syllable-per-count))
  54. (define jamo-syllable-end (+ jamo-syllable-start jamo-syllable-count))
  55. (define (sv-jamo-initial-consonant? sv)
  56. (and (>= sv jamo-initial-consonant-start)
  57. (< sv jamo-initial-consonant-end)))
  58. (define (sv-jamo-trailing-consonant? sv)
  59. (and (>= sv jamo-trailing-consonant-start)
  60. (< sv jamo-trailing-consonant-end)))
  61. (define (sv-jamo-vowel? sv)
  62. (and (>= sv jamo-vowel-start)
  63. (< sv jamo-vowel-end)))
  64. ;; assumes SV-HAS-CANONICAL-DECOMPOSITION? has returned #t
  65. (define (sv-canonical-decomposition-encoding s)
  66. (vector-ref *canonical-decompositions*
  67. (binary-search *canonical-decomposition-scalar-values* s)))
  68. (define (string-normalize-nfd s)
  69. (decompose #f s))
  70. (define (string-normalize-nfkd s)
  71. (decompose #t s))
  72. (define (decompose compat? s)
  73. (let ((size (string-length s)))
  74. (let loop ((i 0)
  75. (rev-chars '()))
  76. (if (>= i size)
  77. (reorder-according-to-combining-class!
  78. (list->string (reverse rev-chars)))
  79. (let* ((c (string-ref s i))
  80. (sv (char->scalar-value c)))
  81. (if (sv-hangul-syllable? sv)
  82. (loop (+ 1 i)
  83. (prepend-reverse-jamo-decomposition sv rev-chars))
  84. (loop (+ 1 i)
  85. (append (reverse-decomposition compat? sv) rev-chars))))))))
  86. (define (sv-hangul-syllable? sv)
  87. (and (>= sv jamo-syllable-start)
  88. (< sv jamo-syllable-end)))
  89. (define (prepend-reverse-jamo-decomposition sv rev-chars)
  90. (let* ((offset (- sv jamo-syllable-start))
  91. (l (+ jamo-initial-consonant-start
  92. (quotient offset jamo-syllable-per-count)))
  93. (v (+ jamo-vowel-start
  94. (quotient (modulo offset jamo-syllable-per-count)
  95. jamo-trailing-consonant-count)))
  96. (t (+ jamo-trailing-consonant-start
  97. (modulo offset jamo-trailing-consonant-count)))
  98. (either-way
  99. (cons (scalar-value->char v)
  100. (cons (scalar-value->char l)
  101. rev-chars))))
  102. (if (= t jamo-trailing-consonant-start)
  103. either-way
  104. (cons (scalar-value->char t) either-way))))
  105. (define (reverse-decomposition compat? sv)
  106. (let recur ((sv sv))
  107. (cond
  108. ((and compat? (sv-has-compatibility-decomposition? sv))
  109. (let* ((pos (binary-search *compatibility-scalar-values* sv))
  110. (end (vector-ref *compatibility-indices* (+ pos 1))))
  111. (let loop ((index (vector-ref *compatibility-indices* pos))
  112. (rev '()))
  113. (if (>= index end)
  114. rev
  115. (loop (+ 1 index)
  116. (append (recur (vector-ref *compatibility-decompositions* index))
  117. rev))))))
  118. ((sv-has-canonical-decomposition? sv)
  119. (let ((enc (sv-canonical-decomposition-encoding sv)))
  120. (cond
  121. ;; it's either a number with one or two concatenated 16-bit numbers from
  122. ;; the BMP
  123. ((number? enc)
  124. (let ((rest (recur (bitwise-and #xffff enc)))
  125. (second (bitwise-and #xffff (arithmetic-shift enc -16))))
  126. (if (zero? second)
  127. rest
  128. (append (recur second) rest))))
  129. ;; ... or a 1-element list or pair of scalar values
  130. ((null? (cdr enc))
  131. ;; 1 element
  132. (recur (car enc)))
  133. ;; 2 is max
  134. (else
  135. (append (recur (cdr enc))
  136. (recur (car enc)))))))
  137. (else
  138. (list (scalar-value->char sv))))))
  139. ; bubble-sort decompositions accoring to combining class
  140. ; returns the modified string
  141. (define (reorder-according-to-combining-class! s)
  142. (let ((size (string-length s)))
  143. (let repeat ()
  144. (let loop ((i 0)
  145. (swapped? #f))
  146. (cond
  147. ((< (+ i 1) size)
  148. (let ((sv-i (char->scalar-value (string-ref s i)))
  149. (sv-i+1 (char->scalar-value (string-ref s (+ i 1)))))
  150. (let ((cc-i (sv-combining-class sv-i))
  151. (cc-i+1 (sv-combining-class sv-i+1)))
  152. (if (and (not (zero? cc-i))
  153. (not (zero? cc-i+1))
  154. (< cc-i+1 cc-i))
  155. (begin
  156. (string-set! s i (scalar-value->char sv-i+1))
  157. (string-set! s (+ i 1) (scalar-value->char sv-i))
  158. (loop (+ 1 i) #t))
  159. (loop (+ 1 i) swapped?)))))
  160. (swapped? (repeat))
  161. (else s))))))
  162. (define (compose-2 sv-1 sv-2)
  163. (let ((encoding (bitwise-ior (arithmetic-shift sv-2 16)
  164. sv-1)))
  165. (cond
  166. ((binary-search *composition-encodings* encoding)
  167. => (lambda (index)
  168. (vector-ref *composition-scalar-values* index)))
  169. (else #f))))
  170. (define (compose! s)
  171. (let ((size (string-length s)))
  172. (let loop ((p 0) ; output index for finished combined character
  173. (p2 1) ; output index for uncombined characters
  174. (i 0) ; input index for starting character
  175. (j 1)) ; input index for characters to be combined
  176. (if (< i size)
  177. (let* ((sv-i (char->scalar-value (string-ref s i)))
  178. (cc-i (sv-combining-class sv-i)))
  179. (if (zero? cc-i)
  180. (if (= j size)
  181. (begin
  182. ;; we're done combining with sv-i; skip past
  183. ;; combining sequences in both input and output
  184. (string-set! s p (scalar-value->char sv-i))
  185. (substring s 0 (min size (max (+ p 1) p2))))
  186. (let* ((sv-j (char->scalar-value (string-ref s j)))
  187. (cc-j (sv-combining-class sv-j)))
  188. (cond
  189. ((and (= j (+ i 1))
  190. (sv-jamo-initial-consonant? sv-i)
  191. (sv-jamo-vowel? sv-j))
  192. ;; need Hangul composition
  193. (if (and (< (+ j 1) size)
  194. (sv-jamo-trailing-consonant?
  195. (char->scalar-value (string-ref s (+ j 1)))))
  196. ;; 3-char composition
  197. (let ((composite
  198. (+ jamo-syllable-start
  199. (* (- sv-i jamo-initial-consonant-start)
  200. jamo-syllable-per-count)
  201. (* (- sv-j jamo-vowel-start)
  202. jamo-trailing-consonant-count)
  203. (- (char->scalar-value (string-ref s (+ j 1)))
  204. jamo-trailing-consonant-start))))
  205. (string-set! s i (scalar-value->char composite))
  206. (loop p p2 i (+ j 2)))
  207. ;; 2-char composition
  208. (let ((composite
  209. (+ jamo-syllable-start
  210. (* (- sv-i jamo-initial-consonant-start)
  211. jamo-syllable-per-count)
  212. (* (- sv-j jamo-vowel-start)
  213. jamo-trailing-consonant-count))))
  214. (string-set! s i (scalar-value->char composite))
  215. (loop p p2 i (+ j 1)))))
  216. ((let ((previous-cc (sv-combining-class (char->scalar-value (string-ref s (- j 1))))))
  217. ;; check if blocked
  218. (and (<= previous-cc cc-j)
  219. (compose-2 sv-i sv-j)))
  220. ;; we can combine; store result temporarily at i;
  221. ;; advance past the combining mark
  222. => (lambda (combined)
  223. (string-set! s i (scalar-value->char combined))
  224. (loop p p2 i (+ j 1))))
  225. ((zero? cc-j)
  226. ;; both are combining class 0; we're done
  227. ;; combining with sv-i; skip past combining sequences
  228. ;; in both input and output
  229. (string-set! s p (scalar-value->char sv-i))
  230. (loop p2 (+ p2 1) j (+ 1 j)))
  231. (else
  232. (let skip ((j j) (p2 p2))
  233. (if (< j size)
  234. (let ((sv-j (char->scalar-value (string-ref s j))))
  235. (if (= (sv-combining-class sv-j) cc-j)
  236. (begin
  237. (string-set! s p2 (scalar-value->char sv-j))
  238. (skip (+ j 1) (+ p2 1)))
  239. (loop p p2 i j)))
  240. (loop p p2 i j)))))))
  241. (loop (+ p 1) (+ p2 1) (+ i 1) (+ j 1))))
  242. (substring s 0 (min size p2))))))
  243. (define (string-normalize-nfc s)
  244. (compose! (string-normalize-nfd s)))
  245. (define (string-normalize-nfkc s)
  246. (compose! (string-normalize-nfkd s)))