wtf8.wat 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229
  ;; Byte storage for a WTF-8 encoded string: a mutable array of 8-bit
  ;; elements.
  (type $wtf8
    (array (mut i8)))
  ;; Cursor over a $wtf8 byte array.
  (type $iter
    (struct
      ;; The bytes being walked; immutable, so a cursor always refers to the
      ;; array it was created with.
      (field $wtf8 (ref $wtf8))
      ;; Index of the next byte to decode.
      (field $pos (mut i32))))
  ;; Growable output buffer used to assemble a $wtf8 byte array.
  (type $builder
    (struct
      ;; Backing storage; mutable so that it can be replaced by a larger
      ;; array when it fills up.
      (field $wtf8 (mut (ref $wtf8)))
      ;; Number of bytes written into the backing storage so far.
      (field $pos (mut i32))))
  ;; Linear memory holding the two lookup tables of the decoder state
  ;; machine.  Bytes 0-255 are the byte -> transition table; the
  ;; (state + transition) -> state table follows at byte 256.
  (memory $decoder 1)
  (data
    (memory $decoder)
    (i32.const 0)

    ;; The generalized UTF-8 decoder is a translation of:
    ;;   https://chromium.googlesource.com/v8/v8/+/main/src/third_party/utf8-decoder/generalized-utf8-decoder.h
    ;;
    ;; Table 1: transitions (256 entries, indexed by byte value).
    ;;
    ;; Each byte is mapped to a transition.  A transition takes the current
    ;; state to a new state, and it also determines which bits of the
    ;; current byte contribute to the decoded codepoint:
    ;;
    ;;   Transition | Bits of the current byte that are kept
    ;;   ---------------------------------------------------
    ;;    0, 1      | #b01111111
    ;;    2, 3      | #b00111111
    ;;    4, 5      | #b00011111
    ;;    6, 7      | #b00001111
    ;;    8, 9      | #b00000111
    ;;    10        | #b00000011
    ;;
    ;; The WTF-8 encoding therefore imposes these constraints:
    ;;
    ;;   1. 1-byte encodings need transition 0 or 1, so that all of the low
    ;;      7 bits are preserved.
    ;;   2. Continuation bytes (#x80 to #xBF) have the form #b10xxxxxx and
    ;;      therefore need a transition between 0 and 3.
    ;;   3. Leading bytes of 2-byte encodings have the form #b110yyyyy and
    ;;      therefore need a transition between 2 and 5.
    ;;   4. Leading bytes of 3-byte encodings (#b1110zzzz) need a transition
    ;;      between 4 and 7.
    ;;   5. Leading bytes of 4-byte encodings (#b11110uuu) need a transition
    ;;      between 6 and 9.
    ;;   6. Extra states are needed to impose the irregular constraints.
    ;;      Where it is known that some of the most significant bits of the
    ;;      xxxx in e.g. #b1110xxxx are 0, a higher transition value can be
    ;;      used.
    ;;   7. Transitions into invalid states can use any transition value.
    #vu8(0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0   ;; 00-0F
         0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0   ;; 10-1F
         0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0   ;; 20-2F
         0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0   ;; 30-3F
         0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0   ;; 40-4F
         0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0   ;; 50-5F
         0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0   ;; 60-6F
         0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0   ;; 70-7F
         1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1   ;; 80-8F
         2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2   ;; 90-9F
         3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3   ;; A0-AF
         3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3   ;; B0-BF
         8  8  4  4  4  4  4  4  4  4  4  4  4  4  4  4   ;; C0-CF
         4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4   ;; D0-DF
         9  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5   ;; E0-EF
         10 6  6  6  7  8  8  8  8  8  8  8  8  8  8  8)  ;; F0-FF

    ;; Table 2: states (8 rows of 11 entries, stored from byte 256 on).
    ;;
    ;; Maps a state to a new state when a transition is added to it.  Each
    ;; row is one state (the state's value is the offset of its row), and
    ;; each column is one transition:
    ;;
    ;;   00-7F
    ;;   |  80-8F
    ;;   |  |  90-9F
    ;;   |  |  |  A0-BF
    ;;   |  |  |  |  C2-DF
    ;;   |  |  |  |  |  E1-EF
    ;;   |  |  |  |  |  |  F1-F3
    ;;   |  |  |  |  |  |  |  F4
    ;;   |  |  |  |  |  |  |  |  C0, C1, F5-FF
    ;;   |  |  |  |  |  |  |  |  |  E0
    ;;   |  |  |  |  |  |  |  |  |  |  F0
    #vu8(0  0  0  0  0  0  0  0  0  0  0     ;; REJECT = 0
         11 0  0  0  22 33 44 55 0  66 77    ;; ACCEPT = 11
         0  11 11 11 0  0  0  0  0  0  0     ;; 2-byte = 22
         0  22 22 22 0  0  0  0  0  0  0     ;; 3-byte = 33
         0  33 33 33 0  0  0  0  0  0  0     ;; 4-byte = 44
         0  33 0  0  0  0  0  0  0  0  0     ;; 4-byte low = 55
         0  0  0  22 0  0  0  0  0  0  0     ;; 3-byte high = 66
         0  0  33 33 0  0  0  0  0  0  0))   ;; 4-byte mid/high = 77
  ;; Feed one byte to the decoder state machine.
  ;;
  ;; Parameters:
  ;;   $byte   the next input byte; used directly as an index into the
  ;;           transition table at offset 0 of the $decoder memory.
  ;;   $buf    the codepoint bits accumulated so far.
  ;;   $state  the current decoder state (a row offset into the state
  ;;           table, e.g. 11 for ACCEPT).
  ;;
  ;; Results (two values, in this order):
  ;;   1. the updated codepoint accumulator;
  ;;   2. the new decoder state (0 is REJECT, 11 is ACCEPT).
  (func $decode-wtf8
    (param $byte i32) (param $buf i32) (param $state i32)
    (result i32 i32)
    (local $transition i32)
    (local $payload-mask i32)
    ;; Look up the transition for this byte.
    (local.set $transition
      (i32.load8_u $decoder (local.get $byte)))
    ;; Each pair of transition values drops one more high bit from the
    ;; mask of payload bits, starting from #x7f.
    (local.set $payload-mask
      (i32.shr_u (i32.const #x7f)
                 (i32.shr_u (local.get $transition) (i32.const 1))))
    ;; First result: shift the accumulator up by 6 bits and merge in the
    ;; payload bits of this byte.
    (i32.or
      (i32.and (local.get $byte) (local.get $payload-mask))
      (i32.shl (local.get $buf) (i32.const 6)))
    ;; Second result: the next state, read from the state table that
    ;; starts at byte 256 of the $decoder memory.
    (i32.load8_u $decoder offset=256
      (i32.add (local.get $transition) (local.get $state))))
  ;; Exported as "as_iter": wrap a $wtf8 byte array in a fresh $iter whose
  ;; position is 0, i.e. at the first byte.
  (func $string_iter (export "as_iter")
    (param $wtf8 (ref $wtf8))
    (result (ref $iter))
    (struct.new $iter
      (local.get $wtf8)
      (i32.const 0)))
  ;; Exported as "iter_next": decode the codepoint at the iterator's
  ;; current position and advance the iterator past it.
  ;;
  ;; Returns the decoded codepoint, or -1 when the position is already at
  ;; or beyond the end of the array.  Traps (unreachable) if the bytes are
  ;; rejected by the decoder or if the array ends in the middle of a
  ;; multi-byte sequence.
  (func $iter_next (export "iter_next")
    (param $iter (ref $iter))
    (result i32)
    (local $bytes (ref $wtf8))
    (local $len i32)
    (local $cursor i32)
    (local $codepoint i32)
    (local $state i32)
    (local.set $bytes (struct.get $iter $wtf8 (local.get $iter)))
    (local.set $len (array.len (local.get $bytes)))
    (local.set $cursor (struct.get $iter $pos (local.get $iter)))
    ;; Nothing left to decode: report end of iteration.
    (if (i32.le_u (local.get $len) (local.get $cursor))
      (then (return (i32.const -1))))
    ;; Start from ACCEPT; $codepoint starts at its default value of 0.
    (local.set $state (i32.const 11))
    (loop $next-byte
      ;; Ran off the end in the middle of a sequence: bad WTF-8.
      (if (i32.le_u (local.get $len) (local.get $cursor))
        (then (unreachable)))
      ;; $decode-wtf8 leaves (codepoint, state) on the stack, so the state
      ;; is popped first.
      (call $decode-wtf8
            (array.get_u $wtf8 (local.get $bytes) (local.get $cursor))
            (local.get $codepoint)
            (local.get $state))
      (local.set $state)
      (local.set $codepoint)
      ;; State 0 is REJECT; the input must be valid WTF-8.
      (if (i32.eqz (local.get $state))
        (then (unreachable)))
      (local.set $cursor (i32.add (local.get $cursor) (i32.const 1)))
      ;; Keep consuming bytes until the decoder is back in ACCEPT.
      (br_if $next-byte
             (i32.ne (local.get $state) (i32.const 11))))
    (struct.set $iter $pos (local.get $iter) (local.get $cursor))
    (local.get $codepoint))
  ;; Exported as "string_builder": create an empty $builder backed by a
  ;; zero-filled 256-byte array, with a write position of 0.
  (func $string_builder (export "string_builder")
    (result (ref $builder))
    (struct.new $builder
      (array.new_default $wtf8
        (i32.const 256))
      (i32.const 0)))
  ;; Exported as "builder_push_codepoint": append the WTF-8 encoding of
  ;; codepoint $cp (1 to 4 bytes) to $builder, growing the backing array
  ;; first if fewer than 4 bytes of room remain.
  ;;
  ;; Parameters:
  ;;   $builder  the builder to append to; its $pos (and, on growth, its
  ;;             $wtf8 array) are updated in place.
  ;;   $cp       the codepoint, interpreted as unsigned.  Surrogates are
  ;;             encoded like any other 3-byte codepoint.
  ;;
  ;; Traps (unreachable) if $cp is above #x10ffff.  Such a value has no
  ;; well-formed encoding: the 4-byte lead byte would overflow, and the
  ;; resulting bytes would be rejected by the decoder.  This also catches
  ;; -1, the end-of-iteration value returned by $iter_next.
  (func $builder_push_codepoint (export "builder_push_codepoint")
    (param $builder (ref $builder))
    (param $cp i32)
    (local $wtf8 (ref $wtf8))
    (local $pos i32)
    ;; Reject out-of-range codepoints before touching the builder.
    (if (i32.gt_u (local.get $cp) (i32.const #x10ffff))
      (then (unreachable)))
    (local.set $wtf8 (struct.get $builder $wtf8 (local.get $builder)))
    (local.set $pos (struct.get $builder $pos (local.get $builder)))
    ;; Make sure the longest possible encoding (4 bytes) fits.  If not,
    ;; allocate an array of twice the length, copy the $pos bytes written
    ;; so far into it, and install it in the builder.
    (if (i32.lt_u (array.len (local.get $wtf8))
                  (i32.add (i32.const 4) (local.get $pos)))
      (then
        (local.set $wtf8
          (array.new_default
            $wtf8
            (i32.shl (array.len (local.get $wtf8)) (i32.const 1))))
        (array.copy $wtf8 $wtf8
          (local.get $wtf8) (i32.const 0)
          (struct.get $builder $wtf8 (local.get $builder))
          (i32.const 0) (local.get $pos))
        (struct.set $builder $wtf8 (local.get $builder)
          (local.get $wtf8))))
    ;; 1 byte: #b0xxxxxxx
    (if (i32.le_u (local.get $cp) (i32.const #x7f))
      (then
        (array.set $wtf8 (local.get $wtf8) (local.get $pos)
          (local.get $cp))
        (struct.set $builder $pos (local.get $builder)
          (i32.add (local.get $pos) (i32.const 1)))
        (return)))
    ;; 2 bytes: #b110yyyyy #b10xxxxxx
    (if (i32.le_u (local.get $cp) (i32.const #x7ff))
      (then
        (array.set $wtf8 (local.get $wtf8) (local.get $pos)
          (i32.or (i32.shr_u (local.get $cp) (i32.const 6))
                  (i32.const #b11000000)))
        (array.set $wtf8 (local.get $wtf8)
          (i32.add (local.get $pos) (i32.const 1))
          (i32.or (i32.and (local.get $cp) (i32.const #b00111111))
                  (i32.const #b10000000)))
        (struct.set $builder $pos (local.get $builder)
          (i32.add (local.get $pos) (i32.const 2)))
        (return)))
    ;; 3 bytes: #b1110zzzz #b10yyyyyy #b10xxxxxx
    (if (i32.le_u (local.get $cp) (i32.const #xffff))
      (then
        (array.set $wtf8 (local.get $wtf8) (local.get $pos)
          (i32.or (i32.shr_u (local.get $cp) (i32.const 12))
                  (i32.const #b11100000)))
        (array.set $wtf8 (local.get $wtf8)
          (i32.add (local.get $pos) (i32.const 1))
          (i32.or (i32.and (i32.shr_u (local.get $cp) (i32.const 6))
                           (i32.const #b00111111))
                  (i32.const #b10000000)))
        (array.set $wtf8 (local.get $wtf8)
          (i32.add (local.get $pos) (i32.const 2))
          (i32.or (i32.and (local.get $cp) (i32.const #b00111111))
                  (i32.const #b10000000)))
        (struct.set $builder $pos (local.get $builder)
          (i32.add (local.get $pos) (i32.const 3)))
        (return)))
    ;; 4 bytes: #b11110uuu #b10zzzzzz #b10yyyyyy #b10xxxxxx
    ;; The range check at the top guarantees that $cp >> 18 is at most 4,
    ;; so the lead byte stays within #xF0-#xF4.
    (array.set $wtf8 (local.get $wtf8) (local.get $pos)
      (i32.or (i32.shr_u (local.get $cp) (i32.const 18))
              (i32.const #b11110000)))
    (array.set $wtf8 (local.get $wtf8)
      (i32.add (local.get $pos) (i32.const 1))
      (i32.or (i32.and (i32.shr_u (local.get $cp) (i32.const 12))
                       (i32.const #b00111111))
              (i32.const #b10000000)))
    (array.set $wtf8 (local.get $wtf8)
      (i32.add (local.get $pos) (i32.const 2))
      (i32.or (i32.and (i32.shr_u (local.get $cp) (i32.const 6))
                       (i32.const #b00111111))
              (i32.const #b10000000)))
    (array.set $wtf8 (local.get $wtf8)
      (i32.add (local.get $pos) (i32.const 3))
      (i32.or (i32.and (local.get $cp) (i32.const #b00111111))
              (i32.const #b10000000)))
    (struct.set $builder $pos (local.get $builder)
      (i32.add (local.get $pos) (i32.const 4))))
  ;; Exported as "finish_builder": return a newly allocated $wtf8 array
  ;; holding exactly the bytes written to $builder so far.  The builder is
  ;; only read, not modified.
  (func $builder_finish (export "finish_builder")
    (param $builder (ref $builder))
    (result (ref $wtf8))
    (local $result (ref $wtf8))
    (local $used i32)
    ;; The builder's write position is the length of the result.
    (local.set $used
      (struct.get $builder $pos (local.get $builder)))
    (local.set $result
      (array.new_default $wtf8 (local.get $used)))
    ;; Copy the first $used bytes of the backing array into the result.
    (array.copy $wtf8 $wtf8
      (local.get $result)
      (i32.const 0)
      (struct.get $builder $wtf8 (local.get $builder))
      (i32.const 0)
      (local.get $used))
    (local.get $result))