123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151 |
- // License: GPLv3 Copyright: 2023, Kovid Goyal, <kovid at kovidgoyal.net>
- package unicode_names
- import (
- "bytes"
- _ "embed"
- "encoding/binary"
- "fmt"
- "strings"
- "sync"
- "time"
- "kitty/tools/utils"
- "kitty/tools/utils/images"
- )
- type mark_set = *utils.Set[uint16]
- //go:embed data_generated.bin
- var unicode_name_data string
- var _ = fmt.Print
- var names map[rune]string
- var marks []rune
- var word_map map[string][]uint16
- func add_word(codepoint uint16, word []byte) {
- if codepoint <= 32 || codepoint == 127 || (128 <= codepoint && codepoint <= 159) || len(word) < 2 {
- return
- }
- w := utils.UnsafeBytesToString(word)
- word_map[w] = append(word_map[w], codepoint)
- }
- func add_words(codepoint uint16, raw []byte) {
- for len(raw) > 0 {
- idx := bytes.IndexByte(raw, ' ')
- if idx < 0 {
- add_word(codepoint, raw)
- break
- }
- if idx > 0 {
- add_word(codepoint, raw[:idx])
- }
- raw = raw[idx+1:]
- }
- }
- func parse_record(record []byte, mark uint16) {
- codepoint := rune(binary.LittleEndian.Uint32(record))
- record = record[4:]
- marks[mark] = codepoint
- namelen := binary.LittleEndian.Uint16(record)
- record = record[2:]
- name := utils.UnsafeBytesToString(record[:namelen])
- names[codepoint] = name
- add_words(mark, record[:namelen])
- if len(record) > int(namelen) {
- add_words(mark, record[namelen:])
- }
- }
- var parse_once sync.Once
- func parse_data() {
- raw := utils.ReadCompressedEmbeddedData(unicode_name_data)
- num_of_lines := binary.LittleEndian.Uint32(raw)
- raw = raw[4:]
- num_of_words := binary.LittleEndian.Uint32(raw)
- raw = raw[4:]
- names = make(map[rune]string, num_of_lines)
- word_map = make(map[string][]uint16, num_of_words)
- marks = make([]rune, num_of_lines)
- var mark uint16
- for len(raw) > 0 {
- record_len := binary.LittleEndian.Uint16(raw)
- raw = raw[2:]
- parse_record(raw[:record_len], mark)
- mark += 1
- raw = raw[record_len:]
- }
- }
- func Initialize() {
- parse_once.Do(parse_data)
- }
- func NameForCodePoint(cp rune) string {
- Initialize()
- return names[cp]
- }
- func find_matching_codepoints(prefix string) (ans mark_set) {
- for q, marks := range word_map {
- if strings.HasPrefix(q, prefix) {
- if ans == nil {
- ans = utils.NewSet[uint16](len(marks) * 2)
- }
- ans.AddItems(marks...)
- }
- }
- return ans
- }
- func marks_for_query(query string) (ans mark_set) {
- Initialize()
- prefixes := strings.Split(strings.ToLower(query), " ")
- results := make(chan mark_set, len(prefixes))
- ctx := images.Context{}
- ctx.Parallel(0, len(prefixes), func(nums <-chan int) {
- for i := range nums {
- results <- find_matching_codepoints(prefixes[i])
- }
- })
- close(results)
- for x := range results {
- if ans == nil {
- ans = x
- } else {
- ans = ans.Intersect(x)
- }
- }
- if ans == nil {
- ans = utils.NewSet[uint16](0)
- }
- return
- }
- func CodePointsForQuery(query string) (ans []rune) {
- x := marks_for_query(query)
- ans = make([]rune, x.Len())
- i := 0
- for m := range x.Iterable() {
- ans[i] = marks[m]
- i += 1
- }
- return
- }
- func Develop() {
- start := time.Now()
- Initialize()
- fmt.Println("Parsing unicode name data took:", time.Since(start))
- start = time.Now()
- num := CodePointsForQuery("arr")
- fmt.Println("Querying arr took:", time.Since(start), "and found:", len(num))
- start = time.Now()
- num = CodePointsForQuery("arr right")
- fmt.Println("Querying arr right took:", time.Since(start), "and found:", len(num))
- }
|