unicode.vim 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472
  1. " Script to extract tables from Unicode .txt files, to be used in src/mbyte.c.
  2. " The format of the UnicodeData.txt file is explained here:
  3. " http://www.unicode.org/Public/5.1.0/ucd/UCD.html
  4. " For the other files see the header.
  5. "
  6. " Might need to update the URL to the emoji-data.txt
  7. " Usage: Vim -S <this-file>
  8. "
  9. " Author: Bram Moolenaar
  10. " Last Update: 2020 Aug 24
  11. " Parse lines of UnicodeData.txt. Creates a list of lists in s:dataprops.
  12. func! ParseDataToProps()
  13. let s:dataprops = []
  14. let lnum = 1
  15. while lnum <= line('$')
  16. let l = split(getline(lnum), '\s*;\s*', 1)
  17. if len(l) != 15
  18. echoerr 'Found ' . len(l) . ' items in line ' . lnum . ', expected 15'
  19. return
  20. endif
  21. call add(s:dataprops, l)
  22. let lnum += 1
  23. endwhile
  24. endfunc
  25. " Parse lines of CaseFolding.txt. Creates a list of lists in s:foldprops.
  26. func! ParseFoldProps()
  27. let s:foldprops = []
  28. let lnum = 1
  29. while lnum <= line('$')
  30. let line = getline(lnum)
  31. if line !~ '^#' && line !~ '^\s*$'
  32. let l = split(line, '\s*;\s*', 1)
  33. if len(l) != 4
  34. echoerr 'Found ' . len(l) . ' items in line ' . lnum . ', expected 4'
  35. return
  36. endif
  37. call add(s:foldprops, l)
  38. endif
  39. let lnum += 1
  40. endwhile
  41. endfunc
  42. " Parse lines of EastAsianWidth.txt. Creates a list of lists in s:widthprops.
  43. func! ParseWidthProps()
  44. let s:widthprops = []
  45. let lnum = 1
  46. while lnum <= line('$')
  47. let line = getline(lnum)
  48. if line !~ '^#' && line !~ '^\s*$'
  49. let l = split(line, '\s*;\s*', 1)
  50. if len(l) != 2
  51. echoerr 'Found ' . len(l) . ' items in line ' . lnum . ', expected 2'
  52. return
  53. endif
  54. call add(s:widthprops, l)
  55. endif
  56. let lnum += 1
  57. endwhile
  58. endfunc
  59. " Build the toLower or toUpper table in a new buffer.
  60. " Uses s:dataprops.
  61. func! BuildCaseTable(name, index)
  62. let start = -1
  63. let end = -1
  64. let step = 0
  65. let add = -1
  66. let ranges = []
  67. for p in s:dataprops
  68. if p[a:index] != ''
  69. let n = ('0x' . p[0]) + 0
  70. let nl = ('0x' . p[a:index]) + 0
  71. if start >= 0 && add == nl - n && (step == 0 || n - end == step)
  72. " continue with same range.
  73. let step = n - end
  74. let end = n
  75. else
  76. if start >= 0
  77. " produce previous range
  78. call Range(ranges, start, end, step, add)
  79. endif
  80. let start = n
  81. let end = n
  82. let step = 0
  83. let add = nl - n
  84. endif
  85. endif
  86. endfor
  87. if start >= 0
  88. call Range(ranges, start, end, step, add)
  89. endif
  90. " New buffer to put the result in.
  91. new
  92. exe "file to" . a:name
  93. call setline(1, "static convertStruct to" . a:name . "[] =")
  94. call setline(2, "{")
  95. call append('$', ranges)
  96. call setline('$', getline('$')[:-2]) " remove last comma
  97. call setline(line('$') + 1, "};")
  98. wincmd p
  99. endfunc
  100. " Build the foldCase table in a new buffer.
  101. " Uses s:foldprops.
  102. func! BuildFoldTable()
  103. let start = -1
  104. let end = -1
  105. let step = 0
  106. let add = -1
  107. let ranges = []
  108. for p in s:foldprops
  109. if p[1] == 'C' || p[1] == 'S'
  110. let n = ('0x' . p[0]) + 0
  111. let nl = ('0x' . p[2]) + 0
  112. if start >= 0 && add == nl - n && (step == 0 || n - end == step)
  113. " continue with same range.
  114. let step = n - end
  115. let end = n
  116. else
  117. if start >= 0
  118. " produce previous range
  119. call Range(ranges, start, end, step, add)
  120. endif
  121. let start = n
  122. let end = n
  123. let step = 0
  124. let add = nl - n
  125. endif
  126. endif
  127. endfor
  128. if start >= 0
  129. call Range(ranges, start, end, step, add)
  130. endif
  131. " New buffer to put the result in.
  132. new
  133. file foldCase
  134. call setline(1, "static convertStruct foldCase[] =")
  135. call setline(2, "{")
  136. call append('$', ranges)
  137. call setline('$', getline('$')[:-2]) " remove last comma
  138. call setline(line('$') + 1, "};")
  139. wincmd p
  140. endfunc
  141. func! Range(ranges, start, end, step, add)
  142. let s = printf("\t{0x%x,0x%x,%d,%d},", a:start, a:end, a:step == 0 ? -1 : a:step, a:add)
  143. call add(a:ranges, s)
  144. endfunc
  145. " Build the combining table.
  146. " Uses s:dataprops.
  147. func! BuildCombiningTable()
  148. let start = -1
  149. let end = -1
  150. let ranges = []
  151. for p in s:dataprops
  152. " The 'Mc' property was removed, it does take up space.
  153. if p[2] == 'Mn' || p[2] == 'Me'
  154. let n = ('0x' . p[0]) + 0
  155. if start >= 0 && end + 1 == n
  156. " continue with same range.
  157. let end = n
  158. else
  159. if start >= 0
  160. " produce previous range
  161. call add(ranges, printf("\t{0x%04x, 0x%04x},", start, end))
  162. endif
  163. let start = n
  164. let end = n
  165. endif
  166. endif
  167. endfor
  168. if start >= 0
  169. call add(ranges, printf("\t{0x%04x, 0x%04x},", start, end))
  170. endif
  171. " New buffer to put the result in.
  172. new
  173. file combining
  174. call setline(1, " static struct interval combining[] =")
  175. call setline(2, " {")
  176. call append('$', ranges)
  177. call setline('$', getline('$')[:-2]) " remove last comma
  178. call setline(line('$') + 1, " };")
  179. wincmd p
  180. endfunc
  181. " Build the double width or ambiguous width table in a new buffer.
  182. " Uses s:widthprops and s:dataprops.
  183. func! BuildWidthTable(pattern, tableName)
  184. let start = -1
  185. let end = -1
  186. let ranges = []
  187. let dataidx = 0
  188. " Account for indentation differences between ambiguous and doublewidth
  189. " table in mbyte.c
  190. if a:pattern == 'A'
  191. let spc = ' '
  192. else
  193. let spc = "\t"
  194. endif
  195. for p in s:widthprops
  196. if p[1][0] =~ a:pattern
  197. if p[0] =~ '\.\.'
  198. " It is a range. we don't check for composing char then.
  199. let rng = split(p[0], '\.\.')
  200. if len(rng) != 2
  201. echoerr "Cannot parse range: '" . p[0] . "' in width table"
  202. endif
  203. let n = ('0x' . rng[0]) + 0
  204. let n_last = ('0x' . rng[1]) + 0
  205. else
  206. let n = ('0x' . p[0]) + 0
  207. let n_last = n
  208. endif
  209. " Find this char in the data table.
  210. while 1
  211. let dn = ('0x' . s:dataprops[dataidx][0]) + 0
  212. if dn >= n
  213. break
  214. endif
  215. let dataidx += 1
  216. endwhile
  217. if dn != n && n_last == n
  218. echoerr "Cannot find character " . n . " in data table"
  219. endif
  220. " Only use the char when it's not a composing char.
  221. " But use all chars from a range.
  222. let dp = s:dataprops[dataidx]
  223. if n_last > n || (dp[2] != 'Mn' && dp[2] != 'Mc' && dp[2] != 'Me')
  224. if start >= 0 && end + 1 == n
  225. " continue with same range.
  226. else
  227. if start >= 0
  228. " produce previous range
  229. call add(ranges, printf("%s{0x%04x, 0x%04x},", spc, start, end))
  230. if a:pattern == 'A'
  231. call add(s:ambitable, [start, end])
  232. else
  233. call add(s:doubletable, [start, end])
  234. endif
  235. endif
  236. let start = n
  237. endif
  238. let end = n_last
  239. endif
  240. endif
  241. endfor
  242. if start >= 0
  243. call add(ranges, printf("%s{0x%04x, 0x%04x},", spc, start, end))
  244. if a:pattern == 'A'
  245. call add(s:ambitable, [start, end])
  246. else
  247. call add(s:doubletable, [start, end])
  248. endif
  249. endif
  250. " New buffer to put the result in.
  251. new
  252. exe "file " . a:tableName
  253. if a:pattern == 'A'
  254. call setline(1, "static struct interval " . a:tableName . "[] =")
  255. call setline(2, "{")
  256. else
  257. call setline(1, " static struct interval " . a:tableName . "[] =")
  258. call setline(2, " {")
  259. endif
  260. call append('$', ranges)
  261. call setline('$', getline('$')[:-2]) " remove last comma
  262. if a:pattern == 'A'
  263. call setline(line('$') + 1, "};")
  264. else
  265. call setline(line('$') + 1, " };")
  266. endif
  267. wincmd p
  268. endfunc
  269. " Get characters from a list of lines in form "12ab .." or "12ab..56cd ..."
  270. " and put them in dictionary "chardict"
  271. func AddLinesToCharDict(lines, chardict)
  272. for line in a:lines
  273. let tokens = split(line, '\.\.')
  274. let first = str2nr(tokens[0], 16)
  275. if len(tokens) == 1
  276. let last = first
  277. else
  278. let last = str2nr(tokens[1], 16)
  279. endif
  280. for nr in range(first, last)
  281. let a:chardict[nr] = 1
  282. endfor
  283. endfor
  284. endfunc
  285. func Test_AddLinesToCharDict()
  286. let dict = {}
  287. call AddLinesToCharDict([
  288. \ '1234 blah blah',
  289. \ '1235 blah blah',
  290. \ '12a0..12a2 blah blah',
  291. \ '12a1 blah blah',
  292. \ ], dict)
  293. call assert_equal({0x1234: 1, 0x1235: 1,
  294. \ 0x12a0: 1, 0x12a1: 1, 0x12a2: 1,
  295. \ }, dict)
  296. if v:errors != []
  297. echoerr 'AddLinesToCharDict' v:errors
  298. return 1
  299. endif
  300. return 0
  301. endfunc
  302. func CharDictToPairList(chardict)
  303. let result = []
  304. let keys = keys(a:chardict)->map('str2nr(v:val)')->sort('N')
  305. let low = keys[0]
  306. let high = keys[0]
  307. for key in keys
  308. if key > high + 1
  309. call add(result, [low, high])
  310. let low = key
  311. let high = key
  312. else
  313. let high = key
  314. endif
  315. endfor
  316. call add(result, [low, high])
  317. return result
  318. endfunc
  319. func Test_CharDictToPairList()
  320. let dict = {0x1020: 1, 0x1021: 1, 0x1022: 1,
  321. \ 0x1024: 1,
  322. \ 0x2022: 1,
  323. \ 0x2024: 1, 0x2025: 1}
  324. call assert_equal([
  325. \ [0x1020, 0x1022],
  326. \ [0x1024, 0x1024],
  327. \ [0x2022, 0x2022],
  328. \ [0x2024, 0x2025],
  329. \ ], CharDictToPairList(dict))
  330. if v:errors != []
  331. echoerr 'CharDictToPairList' v:errors
  332. return 1
  333. endif
  334. return 0
  335. endfunc
  336. " Build the amoji width table in a new buffer.
  337. func BuildEmojiTable()
  338. " First make the table for all emojis.
  339. let pattern = '; Emoji\s\+#\s'
  340. let lines = map(filter(filter(getline(1, '$'), 'v:val=~"^[1-9]"'), 'v:val=~pattern'), 'matchstr(v:val,"^\\S\\+")')
  341. " Make a dictionary with an entry for each character.
  342. let chardict = {}
  343. call AddLinesToCharDict(lines, chardict)
  344. let pairlist = CharDictToPairList(chardict)
  345. let allranges = map(pairlist, 'printf(" {0x%04x, 0x%04x},", v:val[0], v:val[1])')
  346. " New buffer to put the result in.
  347. new
  348. exe 'file emoji_all'
  349. call setline(1, "static struct interval emoji_all[] =")
  350. call setline(2, "{")
  351. call append('$', allranges)
  352. call setline('$', getline('$')[:-2]) " remove last comma
  353. call setline(line('$') + 1, "};")
  354. wincmd p
  355. " Make the table for wide emojis.
  356. let pattern = '; Emoji_\(Presentation\|Modifier_Base\)\s\+#\s'
  357. let lines = map(filter(filter(getline(1, '$'), 'v:val=~"^[1-9]"'), 'v:val=~pattern'), 'matchstr(v:val,"^\\S\\+")')
  358. " Make a dictionary with an entry for each character.
  359. let chardict = {}
  360. call AddLinesToCharDict(lines, chardict)
  361. " exclude characters that are in the "ambiguous" or "doublewidth" table
  362. for ambi in s:ambitable
  363. for nr in range(ambi[0], ambi[1])
  364. if has_key(chardict, nr)
  365. call remove(chardict, nr)
  366. endif
  367. endfor
  368. endfor
  369. for wide in s:doubletable
  370. for nr in range(wide[0], wide[1])
  371. if has_key(chardict, nr)
  372. call remove(chardict, nr)
  373. endif
  374. endfor
  375. endfor
  376. let pairlist = CharDictToPairList(chardict)
  377. let wide_ranges = map(pairlist, 'printf("\t{0x%04x, 0x%04x},", v:val[0], v:val[1])')
  378. " New buffer to put the result in.
  379. new
  380. exe 'file emoji_wide'
  381. call setline(1, " static struct interval emoji_wide[] =")
  382. call setline(2, " {")
  383. call append('$', wide_ranges)
  384. call setline('$', getline('$')[:-2]) " remove last comma
  385. call setline(line('$') + 1, " };")
  386. wincmd p
  387. endfunc
  388. " First test a few things
  389. let v:errors = []
  390. if Test_AddLinesToCharDict() || Test_CharDictToPairList()
  391. finish
  392. endif
  393. " Try to avoid hitting E36
  394. set equalalways
  395. " Edit the Unicode text file. Requires the netrw plugin.
  396. edit http://unicode.org/Public/UNIDATA/UnicodeData.txt
  397. " Parse each line, create a list of lists.
  398. call ParseDataToProps()
  399. " Build the toLower table.
  400. call BuildCaseTable("Lower", 13)
  401. " Build the toUpper table.
  402. call BuildCaseTable("Upper", 12)
  403. " Build the ranges of composing chars.
  404. call BuildCombiningTable()
  405. " Edit the case folding text file. Requires the netrw plugin.
  406. edit http://www.unicode.org/Public/UNIDATA/CaseFolding.txt
  407. " Parse each line, create a list of lists.
  408. call ParseFoldProps()
  409. " Build the foldCase table.
  410. call BuildFoldTable()
  411. " Edit the width text file. Requires the netrw plugin.
  412. edit http://www.unicode.org/Public/UNIDATA/EastAsianWidth.txt
  413. " Parse each line, create a list of lists.
  414. call ParseWidthProps()
  415. " Build the double width table.
  416. let s:doubletable = []
  417. call BuildWidthTable('[WF]', 'doublewidth')
  418. " Build the ambiguous width table.
  419. let s:ambitable = []
  420. call BuildWidthTable('A', 'ambiguous')
  421. " Edit the emoji text file. Requires the netrw plugin.
  422. " commented out, because it drops too many characters
  423. "edit https://unicode.org/Public/15.0.0/ucd/emoji/emoji-data.txt
  424. "
  425. "" Build the emoji table. Ver. 1.0 - 6.0
  426. "" Must come after the "ambiguous" and "doublewidth" tables
  427. "call BuildEmojiTable()