encoding.scm 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378
  1. ; Part of Scheme 48 1.9. See file COPYING for notices and license.
  2. ; Authors: Mike Sperber
  3. ; Character/string encodings
  4. ; We abstract over the primitive encode-char/decode-char characters to
  5. ; get two sets of procedures, one going through the general
  6. ; text-encoding infrastructure, and the other making use of the VM
  7. ; instructions.
  8. (define-syntax define-coding-procs
  9. (syntax-rules ()
  10. ((define-coding-procs
  11. (do-encode-char do-decode-char)
  12. char-encoding-length
  13. string-encoding-length
  14. encode-char
  15. encode-string
  16. string->bytes-n
  17. string->bytes
  18. decode-char
  19. bytes-string-size
  20. decode-string
  21. bytes->string
  22. bytes->string-n)
  23. (begin
  24. (define (char-encoding-length enc c)
  25. (call-with-values
  26. (lambda ()
  27. (do-encode-char enc c empty-buffer 0 0))
  28. (lambda (ok? count)
  29. count)))
  30. (define (string-encoding-length enc s start-index count)
  31. (let loop ((enc-length 0)
  32. (char-index 0))
  33. (if (>= char-index count)
  34. enc-length
  35. (loop (+ enc-length
  36. (char-encoding-length enc (string-ref s (+ start-index char-index))))
  37. (+ 1 char-index)))))
  38. ; returns byte count of the encoding
  39. (define (encode-char enc c target target-start)
  40. (call-with-values
  41. (lambda ()
  42. (do-encode-char enc
  43. c target target-start
  44. (- (byte-vector-length target) target-start)))
  45. (lambda (ok? count)
  46. count)))
  47. ; Will only produce complete encodings
  48. ; returns three values:
  49. ; - encoding status
  50. ; - # characters consumed
  51. ; - # bytes decoded
  52. (define (encode-string enc source source-start source-count
  53. target target-start target-count)
  54. (let loop ((source-index 0)
  55. (target-index 0))
  56. (cond
  57. ((>= source-index source-count)
  58. (values (enum encoding-status complete)
  59. source-index
  60. target-index))
  61. ((>= target-index target-count)
  62. (values (enum encoding-status insufficient)
  63. source-index
  64. target-index))
  65. (else
  66. (let ((c (string-ref source (+ source-start source-index))))
  67. (call-with-values
  68. (lambda ()
  69. (do-encode-char enc
  70. c
  71. target (+ target-start target-index)
  72. (max 0 (- target-count target-index))))
  73. (lambda (ok? count)
  74. (if (not ok?)
  75. (values (enum encoding-status insufficient)
  76. source-index
  77. target-index)
  78. (loop (+ source-index 1) (+ target-index count))))))))))
  79. (define (string->bytes-n enc s start count)
  80. (let* ((size (string-encoding-length enc s 0 count))
  81. (result (make-byte-vector size 0)))
  82. (encode-string enc s 0 count result 0 size)
  83. result))
  84. (define (string->bytes enc s)
  85. (string->bytes-n enc s 0 (string-length s)))
  86. ; Decoding
  87. ; Returns three values:
  88. ; - decoding status
  89. ; - character if status is COMPLETE, else #f
  90. ; - # bytes consumed if COMPLETE or INCOMPLETE, else #f
  91. (define (decode-char enc bytes start-index count)
  92. (call-with-values
  93. (lambda ()
  94. (do-decode-char enc
  95. bytes start-index
  96. count))
  97. (lambda (maybe-char count)
  98. (cond
  99. (maybe-char
  100. (values (enum decoding-status complete)
  101. maybe-char
  102. count))
  103. (count
  104. (values (enum decoding-status incomplete)
  105. #f
  106. count))
  107. (else
  108. (values (enum decoding-status invalid)
  109. #f
  110. #f))))))
  111. ; If STOP-AT-INVALID? is #f, we'll skip an invalid byte, and pretend
  112. ; it generated one character.
  113. ; Returns three values:
  114. ; - :DECODING-STATUS object
  115. ; - # bytes consumed
  116. ; - # characters decoded
  117. (define (bytes-string-size enc bytes start count stop-at-invalid?)
  118. (let loop ((index 0)
  119. (target-index 0))
  120. (if (>= index count)
  121. (values (enum decoding-status complete)
  122. index target-index)
  123. (call-with-values
  124. (lambda ()
  125. (do-decode-char enc
  126. bytes
  127. (+ start index)
  128. (- count index)))
  129. (lambda (char count)
  130. (cond
  131. (char
  132. (loop (+ index count) (+ target-index 1)))
  133. (count
  134. (values (enum decoding-status incomplete)
  135. index target-index))
  136. (stop-at-invalid?
  137. (values (enum decoding-status invalid)
  138. index target-index))
  139. (else
  140. (loop (+ 1 index) (+ 1 target-index)))))))))
  141. ; Returns three values:
  142. ; - :DECODING-STATUS object
  143. ; - # bytes consumed
  144. ; - # characters decoded
  145. (define (decode-string enc
  146. bytes start count
  147. target target-start target-count
  148. maybe-error-char)
  149. (let loop ((index 0)
  150. (target-index 0))
  151. (cond
  152. ((>= index count)
  153. (values (enum decoding-status complete)
  154. index
  155. target-index))
  156. ((>= target-index target-count)
  157. (values (enum decoding-status insufficient)
  158. index target-index))
  159. (else
  160. (call-with-values
  161. (lambda ()
  162. (do-decode-char enc
  163. bytes
  164. (+ start index)
  165. (- count index)))
  166. (lambda (char count)
  167. (cond
  168. (char
  169. (string-set! target (+ target-start target-index) char)
  170. (loop (+ index count) (+ target-index 1)))
  171. (count
  172. (values (enum decoding-status incomplete)
  173. index target-index))
  174. (maybe-error-char
  175. (string-set! target (+ target-start target-index) maybe-error-char)
  176. (loop (+ 1 index) (+ 1 target-index)))
  177. (else
  178. (values (enum decoding-status invalid)
  179. index target-index)))))))))
  180. ; may be slightly faster because of REVERSE-LIST->STRING
  181. ; If MAYBE-ERROR-CHAR is #f, we'll raise an error upon an invalid encoding
  182. ; If it's a character, it will be used at invalid *and incomplete* encodings
  183. (define (bytes->string enc source maybe-error-char)
  184. (bytes->string-n enc source 0 (byte-vector-length source) maybe-error-char))
  185. (define (bytes->string-n enc source start source-count maybe-error-char)
  186. (let loop ((rev-chars '())
  187. (char-count 0)
  188. (source-index 0))
  189. (if (>= source-index source-count)
  190. (reverse-list->string rev-chars char-count)
  191. (call-with-values
  192. (lambda ()
  193. (do-decode-char enc
  194. source
  195. (+ start source-index)
  196. (- source-count source-index)))
  197. (lambda (char count)
  198. (cond
  199. (char
  200. (loop (cons char rev-chars)
  201. (+ 1 char-count)
  202. (+ count source-index)))
  203. (maybe-error-char
  204. (loop (cons maybe-error-char rev-chars)
  205. (+ 1 char-count)
  206. (+ 1 source-index)))
  207. (count
  208. (decoding-error enc ; ####
  209. "incomplete encoding"
  210. source (+ start source-index)))
  211. (else
  212. (decoding-error enc ; ####
  213. "invalid encoding"
  214. source (+ start source-index)))))))))
  215. ))))
  216. (define-coding-procs (char->utf utf->char)
  217. char-encoding-length/encoding
  218. string-encoding-length/encoding
  219. encode-char/encoding
  220. encode-string/encoding
  221. string->bytes-n/encoding
  222. string->bytes/encoding
  223. decode-char/encoding
  224. bytes-string-size/encoding
  225. decode-string/encoding
  226. bytes->string/encoding
  227. bytes->string-n/encoding)
  228. (define-syntax primitive-encode-char/text-codec
  229. (syntax-rules ()
  230. ((encode-char/text-codec enc ch buffer start count)
  231. (atomically
  232. ((text-codec-encode-char-proc enc) ch buffer start count)))))
  233. (define-syntax primitive-decode-char/text-codec
  234. (syntax-rules ()
  235. ((decode-char/text-codec enc buffer start count)
  236. (atomically
  237. ((text-codec-decode-char-proc enc) buffer start count)))))
  238. (define-coding-procs (primitive-encode-char/text-codec primitive-decode-char/text-codec)
  239. char-encoding-length/text-codec
  240. string-encoding-length/text-codec
  241. encode-char/text-codec
  242. encode-string/text-codec
  243. string->bytes-n/text-codec
  244. string->bytes/text-codec
  245. decode-char/text-codec
  246. bytes-string-size/text-codec
  247. decode-string/text-codec
  248. bytes->string/text-codec
  249. bytes->string-n/text-codec)
  250. (define-syntax define-text-codec-proc
  251. (syntax-rules ()
  252. ((define-text-codec-proc (?name ?arg ...) ?name/codec ?name/encoding)
  253. (define (?name codec ?arg ...)
  254. (let ((spec (text-codec->spec codec)))
  255. (if (text-codec? spec)
  256. (?name/codec spec ?arg ...)
  257. (?name/encoding spec ?arg ...)))))))
  258. (define-text-codec-proc (char-encoding-length c)
  259. char-encoding-length/text-codec char-encoding-length/encoding)
  260. (define-text-codec-proc (string-encoding-length s start-index count)
  261. string-encoding-length/text-codec string-encoding-length/encoding)
  262. (define-text-codec-proc (encode-char c target target-start)
  263. encode-char/text-codec encode-char/encoding)
  264. (define-text-codec-proc (encode-string source source-start source-count
  265. target target-start target-count)
  266. encode-string/text-codec encode-string/encoding)
  267. (define-text-codec-proc (string->bytes-n s start count)
  268. string->bytes-n/text-codec string->bytes-n/encoding)
  269. (define-text-codec-proc (string->bytes s)
  270. string->bytes/text-codec string->bytes/encoding)
  271. (define-text-codec-proc (decode-char bytes start-index count)
  272. decode-char/text-codec decode-char/encoding)
  273. (define-text-codec-proc (bytes-string-size bytes start count stop-at-invalid?)
  274. bytes-string-size/text-codec bytes-string-size/encoding)
  275. (define-text-codec-proc (decode-string bytes start count
  276. target target-start target-count
  277. maybe-error-char)
  278. decode-string/text-codec decode-string/encoding)
  279. (define-text-codec-proc (bytes->string source maybe-error-char)
  280. bytes->string/text-codec bytes->string/encoding)
  281. (define-text-codec-proc (bytes->string-n source start source-count maybe-error-char)
  282. bytes->string-n/text-codec bytes->string-n/encoding)
  283. ;; Utilities
  284. (define empty-buffer (make-byte-vector 0 0))
  285. (define-enumeration encoding-status
  286. (complete insufficient))
  287. (define (decoding-error encoding-name
  288. message
  289. bytes start)
  290. (raise
  291. (make-message-condition
  292. (string-append "error while decoding " encoding-name ": " message))
  293. (make-decoding-error encoding-name
  294. bytes start)))
  295. (define-enumeration decoding-status
  296. (complete incomplete insufficient invalid))
  297. ;; UTF-8
  298. (define (char-encoding-length/utf-8 c)
  299. (char-encoding-length/encoding (enum text-encoding-option utf-8) c))
  300. (define (string-encoding-length/utf-8 s start-index count)
  301. (string-encoding-length/encoding (enum text-encoding-option utf-8)
  302. s start-index count))
  303. (define (encode-char/utf-8 c target target-start)
  304. (encode-char/encoding (enum text-encoding-option utf-8) c target target-start))
  305. (define (encode-string/utf-8 source source-start source-count
  306. target target-start target-count)
  307. (encode-string/encoding (enum text-encoding-option utf-8)
  308. source source-start source-count
  309. target target-start target-count))
  310. (define (string->utf-8-n s start count)
  311. (string->bytes-n/encoding (enum text-encoding-option utf-8) s start count))
  312. (define (string->utf-8 s)
  313. (string->bytes/encoding (enum text-encoding-option utf-8) s))
  314. (define (decode-char/utf-8 bytes start-index count)
  315. (decode-char/encoding (enum text-encoding-option utf-8) bytes start-index count))
  316. (define (bytes-string-size/utf-8 bytes start count stop-at-invalid?)
  317. (bytes-string-size/encoding (enum text-encoding-option utf-8)
  318. bytes start count stop-at-invalid?))
  319. (define (decode-string/utf-8 bytes start count
  320. target target-start target-count
  321. maybe-error-char)
  322. (decode-string/encoding (enum text-encoding-option utf-8)
  323. bytes start count
  324. target target-start target-count
  325. maybe-error-char))
  326. (define (utf-8->string source maybe-error-char)
  327. (bytes->string/encoding (enum text-encoding-option utf-8)
  328. source maybe-error-char))
  329. (define (utf-8->string-n source start source-count maybe-error-char)
  330. (bytes->string-n/encoding (enum text-encoding-option utf-8)
  331. source start source-count maybe-error-char))