unidecode.nim 2.4 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465
  1. #
  2. #
  3. # Nim's Runtime Library
  4. # (c) Copyright 2012 Andreas Rumpf
  5. #
  6. # See the file "copying.txt", included in this
  7. # distribution, for details about the copyright.
  8. #
  9. ## This module is based on Python's [Unidecode](https://pypi.org/project/Unidecode/)
  10. ## module by Tomaz Solc, which in turn is based on the
  11. ## [Text::Unidecode](https://metacpan.org/pod/Text::Unidecode)
  12. ## Perl module by Sean M. Burke.
  13. ##
  14. ## It provides a `unidecode proc <#unidecode,string>`_ that does
  15. ## Unicode to ASCII transliterations: It finds the sequence of ASCII characters
  16. ## that is the closest approximation to the Unicode string.
  17. ##
  18. ## For example, the closest to string "Äußerst" in ASCII is "Ausserst". Some
  19. ## information is lost in this transformation, of course, since several Unicode
  20. ## strings can be transformed to the same ASCII representation. So this is a
  21. ## strictly one-way transformation. However, a human reader will probably
  22. ## still be able to guess from the context what the original string was.
  23. ##
  24. ## This module needs the data file `unidecode.dat` to work: This file is
  25. ## embedded as a resource into your application by default. Alternatively,
  26. ## compile with the `--define:noUnidecodeTable` switch and call the
  27. ## `loadUnidecodeTable proc <#loadUnidecodeTable>`_ to initialize
  28. ## this module.
  29. import std/unicode
  30. when not defined(noUnidecodeTable):
  31. import std/strutils
  32. const translationTable = splitLines(slurp"unidecode/unidecode.dat")
  33. else:
  34. # shared is fine for threading:
  35. var translationTable: seq[string]
  36. proc loadUnidecodeTable*(datafile = "unidecode.dat") =
  37. ## Loads the datafile that `unidecode <#unidecode,string>`_ needs to work.
  38. ## This is only required if the module was compiled with the
  39. ## `--define:noUnidecodeTable` switch. This needs to be called by the
  40. ## main thread before any thread can make a call to `unidecode`.
  41. when defined(noUnidecodeTable):
  42. newSeq(translationTable, 0xffff)
  43. var i = 0
  44. for line in lines(datafile):
  45. translationTable[i] = line
  46. inc(i)
  47. proc unidecode*(s: string): string =
  48. ## Finds the sequence of ASCII characters that is the closest approximation
  49. ## to the UTF-8 string `s`.
  50. runnableExamples:
  51. doAssert unidecode("北京") == "Bei Jing "
  52. doAssert unidecode("Äußerst") == "Ausserst"
  53. result = ""
  54. for r in runes(s):
  55. var c = int(r)
  56. if c <=% 127: add(result, chr(c))
  57. elif c <% translationTable.len: add(result, translationTable[c - 128])