unidecode.nim 2.5 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374
  1. #
  2. #
  3. # Nim's Runtime Library
  4. # (c) Copyright 2012 Andreas Rumpf
  5. #
  6. # See the file "copying.txt", included in this
  7. # distribution, for details about the copyright.
  8. #
  9. ## This module is based on Python's Unidecode module by Tomaz Solc,
  10. ## which in turn is based on the ``Text::Unidecode`` Perl module by
  11. ## Sean M. Burke
  12. ## (http://search.cpan.org/~sburke/Text-Unidecode-0.04/lib/Text/Unidecode.pm ).
  13. ##
  14. ## It provides a single proc that does Unicode to ASCII transliterations:
  15. ## It finds the sequence of ASCII characters that is the closest approximation
  16. ## to the Unicode string.
  17. ##
  18. ## For example, the closest to string "Äußerst" in ASCII is "Ausserst". Some
  19. ## information is lost in this transformation, of course, since several Unicode
  20. ## strings can be transformed into the same ASCII representation. So this is a
  21. ## strictly one-way transformation. However a human reader will probably
  22. ## still be able to guess what original string was meant from the context.
  23. ##
  24. ## This module needs the data file "unidecode.dat" to work: This file is
  25. ## embedded as a resource into your application by default. But you can also
  26. ## define the symbol ``--define:noUnidecodeTable`` during compile time and
  27. ## use the `loadUnidecodeTable` proc to initialize this module.
  28. import unicode
  29. when not defined(noUnidecodeTable):
  30. import strutils
  31. const translationTable = splitLines(slurp"unidecode/unidecode.dat")
  32. else:
  33. # shared is fine for threading:
  34. var translationTable: seq[string]
  35. proc loadUnidecodeTable*(datafile = "unidecode.dat") =
  36. ## loads the datafile that `unidecode` to work. This is only required if
  37. ## the module was compiled with the ``--define:noUnidecodeTable`` switch.
  38. ## This needs to be called by the main thread before any thread can make a
  39. ## call to `unidecode`.
  40. when defined(noUnidecodeTable):
  41. newSeq(translationTable, 0xffff)
  42. var i = 0
  43. for line in lines(datafile):
  44. translationTable[i] = line.string
  45. inc(i)
  46. proc unidecode*(s: string): string =
  47. ## Finds the sequence of ASCII characters that is the closest approximation
  48. ## to the UTF-8 string `s`.
  49. ##
  50. ## Example:
  51. ##
  52. ## ..code-block:: nim
  53. ##
  54. ## unidecode("\x53\x17\x4E\xB0")
  55. ##
  56. ## Results in: "Bei Jing"
  57. ##
  58. result = ""
  59. for r in runes(s):
  60. var c = int(r)
  61. if c <=% 127: add(result, chr(c))
  62. elif c <% translationTable.len: add(result, translationTable[c-128])
  63. when isMainModule:
  64. #loadUnidecodeTable("lib/pure/unidecode/unidecode.dat")
  65. doAssert unidecode("Äußerst") == "Ausserst"
  66. doAssert unidecode("北京") == "Bei Jing "