query.go 3.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151
  1. // License: GPLv3 Copyright: 2023, Kovid Goyal, <kovid at kovidgoyal.net>
  2. package unicode_names
  3. import (
  4. "bytes"
  5. _ "embed"
  6. "encoding/binary"
  7. "fmt"
  8. "strings"
  9. "sync"
  10. "time"
  11. "kitty/tools/utils"
  12. "kitty/tools/utils/images"
  13. )
  14. type mark_set = *utils.Set[uint16]
  15. //go:embed data_generated.bin
  16. var unicode_name_data string
  17. var _ = fmt.Print
  18. var names map[rune]string
  19. var marks []rune
  20. var word_map map[string][]uint16
  21. func add_word(codepoint uint16, word []byte) {
  22. if codepoint <= 32 || codepoint == 127 || (128 <= codepoint && codepoint <= 159) || len(word) < 2 {
  23. return
  24. }
  25. w := utils.UnsafeBytesToString(word)
  26. word_map[w] = append(word_map[w], codepoint)
  27. }
  28. func add_words(codepoint uint16, raw []byte) {
  29. for len(raw) > 0 {
  30. idx := bytes.IndexByte(raw, ' ')
  31. if idx < 0 {
  32. add_word(codepoint, raw)
  33. break
  34. }
  35. if idx > 0 {
  36. add_word(codepoint, raw[:idx])
  37. }
  38. raw = raw[idx+1:]
  39. }
  40. }
  41. func parse_record(record []byte, mark uint16) {
  42. codepoint := rune(binary.LittleEndian.Uint32(record))
  43. record = record[4:]
  44. marks[mark] = codepoint
  45. namelen := binary.LittleEndian.Uint16(record)
  46. record = record[2:]
  47. name := utils.UnsafeBytesToString(record[:namelen])
  48. names[codepoint] = name
  49. add_words(mark, record[:namelen])
  50. if len(record) > int(namelen) {
  51. add_words(mark, record[namelen:])
  52. }
  53. }
  54. var parse_once sync.Once
  55. func parse_data() {
  56. raw := utils.ReadCompressedEmbeddedData(unicode_name_data)
  57. num_of_lines := binary.LittleEndian.Uint32(raw)
  58. raw = raw[4:]
  59. num_of_words := binary.LittleEndian.Uint32(raw)
  60. raw = raw[4:]
  61. names = make(map[rune]string, num_of_lines)
  62. word_map = make(map[string][]uint16, num_of_words)
  63. marks = make([]rune, num_of_lines)
  64. var mark uint16
  65. for len(raw) > 0 {
  66. record_len := binary.LittleEndian.Uint16(raw)
  67. raw = raw[2:]
  68. parse_record(raw[:record_len], mark)
  69. mark += 1
  70. raw = raw[record_len:]
  71. }
  72. }
  73. func Initialize() {
  74. parse_once.Do(parse_data)
  75. }
  76. func NameForCodePoint(cp rune) string {
  77. Initialize()
  78. return names[cp]
  79. }
  80. func find_matching_codepoints(prefix string) (ans mark_set) {
  81. for q, marks := range word_map {
  82. if strings.HasPrefix(q, prefix) {
  83. if ans == nil {
  84. ans = utils.NewSet[uint16](len(marks) * 2)
  85. }
  86. ans.AddItems(marks...)
  87. }
  88. }
  89. return ans
  90. }
  91. func marks_for_query(query string) (ans mark_set) {
  92. Initialize()
  93. prefixes := strings.Split(strings.ToLower(query), " ")
  94. results := make(chan mark_set, len(prefixes))
  95. ctx := images.Context{}
  96. ctx.Parallel(0, len(prefixes), func(nums <-chan int) {
  97. for i := range nums {
  98. results <- find_matching_codepoints(prefixes[i])
  99. }
  100. })
  101. close(results)
  102. for x := range results {
  103. if ans == nil {
  104. ans = x
  105. } else {
  106. ans = ans.Intersect(x)
  107. }
  108. }
  109. if ans == nil {
  110. ans = utils.NewSet[uint16](0)
  111. }
  112. return
  113. }
  114. func CodePointsForQuery(query string) (ans []rune) {
  115. x := marks_for_query(query)
  116. ans = make([]rune, x.Len())
  117. i := 0
  118. for m := range x.Iterable() {
  119. ans[i] = marks[m]
  120. i += 1
  121. }
  122. return
  123. }
  124. func Develop() {
  125. start := time.Now()
  126. Initialize()
  127. fmt.Println("Parsing unicode name data took:", time.Since(start))
  128. start = time.Now()
  129. num := CodePointsForQuery("arr")
  130. fmt.Println("Querying arr took:", time.Since(start), "and found:", len(num))
  131. start = time.Now()
  132. num = CodePointsForQuery("arr right")
  133. fmt.Println("Querying arr right took:", time.Since(start), "and found:", len(num))
  134. }