1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465 |
- #
- #
- # Nim's Runtime Library
- # (c) Copyright 2012 Andreas Rumpf
- #
- # See the file "copying.txt", included in this
- # distribution, for details about the copyright.
- #
- ## This module is based on Python's [Unidecode](https://pypi.org/project/Unidecode/)
- ## module by Tomaz Solc, which in turn is based on the
- ## [Text::Unidecode](https://metacpan.org/pod/Text::Unidecode)
- ## Perl module by Sean M. Burke.
- ##
- ## It provides a `unidecode proc <#unidecode,string>`_ that does
- ## Unicode to ASCII transliterations: It finds the sequence of ASCII characters
- ## that is the closest approximation to the Unicode string.
- ##
- ## For example, the closest to string "Äußerst" in ASCII is "Ausserst". Some
- ## information is lost in this transformation, of course, since several Unicode
- ## strings can be transformed to the same ASCII representation. So this is a
- ## strictly one-way transformation. However, a human reader will probably
- ## still be able to guess from the context what the original string was.
- ##
- ## This module needs the data file `unidecode.dat` to work: This file is
- ## embedded as a resource into your application by default. You can also
- ## define the symbol `--define:noUnidecodeTable` during compile time and
- ## use the `loadUnidecodeTable proc <#loadUnidecodeTable>`_ to initialize
- ## this module.
- import std/unicode
# The transliteration table is either embedded at compile time (default) or
# declared empty so that it can be filled in at run time when the
# `noUnidecodeTable` symbol is defined.
when defined(noUnidecodeTable):
  # shared is fine for threading:
  var translationTable: seq[string]
else:
  import std/strutils

  # One table entry per line of the embedded data file.
  const translationTable = slurp("unidecode/unidecode.dat").splitLines()
proc loadUnidecodeTable*(datafile = "unidecode.dat") =
  ## Loads the datafile that `unidecode <#unidecode,string>`_ needs to work.
  ## This is only required if the module was compiled with the
  ## `--define:noUnidecodeTable` switch; without that switch this proc does
  ## nothing. This needs to be called by the main thread before any thread
  ## can make a call to `unidecode`.
  ##
  ## Each line of `datafile` becomes one table entry, in file order. The
  ## table holds `0xffff` entries; lines beyond that are ignored.
  ##
  ## Raises `IOError` if `datafile` cannot be opened.
  when defined(noUnidecodeTable):
    newSeq(translationTable, 0xffff)
    var i = 0
    for line in lines(datafile):
      # The table has a fixed size: stop reading instead of indexing past
      # its end when the data file has more lines than the table can hold.
      if i >= translationTable.len: break
      translationTable[i] = line
      inc(i)
proc unidecode*(s: string): string =
  ## Finds the sequence of ASCII characters that is the closest approximation
  ## to the UTF-8 string `s`.
  ##
  ## Code points up to 127 are copied unchanged. Any other code point is
  ## replaced by its entry in the translation table, where entry 0 belongs
  ## to code point 128. Code points that fall outside the table are dropped.
  runnableExamples:
    doAssert unidecode("北京") == "Bei Jing "
    doAssert unidecode("Äußerst") == "Ausserst"

  result = ""
  for r in runes(s):
    let c = int(r)
    if c <=% 127:
      add(result, chr(c))
    else:
      # Entry 0 of the table corresponds to code point 128, so the bounds
      # check must be done on the shifted index, not on the raw code point;
      # otherwise the last 128 entries of the table can never be reached.
      let idx = c - 128
      if idx <% translationTable.len:
        add(result, translationTable[idx])
|