twchartoutf8.nim 2.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113
  1. discard """
  2. matrix: "--mm:refc; --mm:orc"
  3. output: '''OK'''
  4. """
  5. import std/[syncio, assertions]
# Assumes WideCharToMultiByte always produces the correct result.
# Windows only.
  8. when not defined(windows):
  9. echo "OK"
  10. else:
  11. import std/widestrs
  # Declarations between this push and the matching pop carry `gcsafe`.
  {.push gcsafe.}

  # Value passed as `CodePage` to request UTF-8 output.
  const CP_UTF8 = 65001'i32

  type
    LPBOOL = ptr int32    # pointer to a 32-bit flag
    LPWCSTR = ptr uint16  # pointer to 16-bit (UTF-16) units

  # Binding to `WideCharToMultiByte` exported by kernel32 (stdcall).
  # Parameter names are kept exactly as declared so named-argument calls
  # continue to work.
  proc WideCharToMultiByte*(
      CodePage: int32,
      dwFlags: int32,
      lpWideCharStr: LPWCSTR,
      cchWideChar: int32,
      lpMultiByteStr: cstring,
      cchMultiByte: int32,
      lpDefaultChar: cstring,
      lpUsedDefaultChar: LPBOOL): int32 {.
      stdcall, dynlib: "kernel32", importc: "WideCharToMultiByte".}

  {.pop.}
  proc convertToUTF8(wc: WideCString, wclen: int32): string =
    ## Converts the first `wclen` UTF-16 units of `wc` into a UTF-8 string
    ## by calling `WideCharToMultiByte` with `CP_UTF8`.
    ##
    ## Asserts that the byte count reported by the sizing call equals the
    ## byte count reported by the converting call.
    let src = cast[LPWCSTR](addr(wc[0]))

    # Sizing call: nil output buffer of length 0; the return value is used
    # as the length of the result string.
    let size = WideCharToMultiByte(CP_UTF8, 0'i32, src, wclen,
                                   cstring(nil), 0'i32,
                                   cstring(nil), LPBOOL(nil))
    result = newString(size)

    # Converting call: writes into the storage of `result`.
    let res = WideCharToMultiByte(CP_UTF8, 0'i32, src, wclen,
                                  cstring(result), size,
                                  cstring(nil), LPBOOL(nil))
    doAssert size == res
  proc testCP(wc: WideCString, lo, hi: int) =
    ## Checks, for every value in `lo..hi` stored as a single UTF-16 unit,
    ## that Nim's `$` on a wide string yields the same UTF-8 bytes as
    ## `convertToUTF8`. Values are compared in batches of `chunk` units.
    ##
    ## `wc` is a scratch buffer; indices `0..chunk` are written, so it must
    ## hold at least `chunk + 1` units.
    ## `lo` and `hi` are the inclusive bounds of the values to test.
    var x = 0
    let chunk = 1024
    for i in lo..hi:
      wc[x] = cast[Utf16Char](i)
      # Advance before the flush check, as testCP2 does. Checking first
      # would overwrite the unit just stored with the terminator, so the
      # last value of every chunk -- `hi` included -- would never be
      # compared.
      inc x
      if (x >= chunk) or (i >= hi):
        wc[x] = Utf16Char(0)
        let a = convertToUTF8(wc, int32(x))
        let b = wc $ chunk
        doAssert a == b
        # The next chunk starts at slot 0, so no unit left over from an
        # earlier chunk is compared again.
        x = 0
  proc testCP2(wc: WideCString, lo, hi: int) =
    ## Checks, for every code point in `lo..hi` (which must lie within
    ## 0x10000..0x10FFFF), that Nim's `$` on a wide string yields the same
    ## UTF-8 bytes as `convertToUTF8`. Each code point is stored as a
    ## surrogate pair; comparisons happen in batches of `chunk` units.
    ##
    ## `wc` is a scratch buffer; indices `0..chunk` are written.
    doAssert((lo >= 0x10000) and (hi <= 0x10FFFF))
    let chunk = 1024
    var filled = 0
    for cp in lo..hi:
      let
        offset = cp - 0x10000
        highUnit = (offset shr 10) or 0xD800   # upper 10 bits
        lowUnit = (offset and 0x3FF) or 0xDC00 # lower 10 bits
      wc[filled] = cast[Utf16Char](highUnit)
      wc[filled + 1] = cast[Utf16Char](lowUnit)
      filled += 2
      if filled >= chunk or cp >= hi:
        # Terminate the batch, compare both conversions, start over.
        wc[filled] = Utf16Char(0)
        let a = convertToUTF8(wc, int32(filled))
        let b = wc $ chunk
        doAssert a == b
        filled = 0
  # Ranges below follow RFC 2781, "UTF-16, an encoding of ISO 10646".
  var wc: WideCString = newWideCString(1024 * 2)

  # U+0001..U+D7FF (U+0000 is skipped)
  wc.testCP(1, 0xD7FF)

  # U+E000..U+FFFF
  wc.testCP(0xE000, 0xFFFF)

  # U+10000..U+10FFFF, stored as surrogate pairs
  wc.testCP2(0x10000, 0x10FFFF)

  proc setUnits(buf: WideCString, units: openArray[int]) =
    ## Writes `units` to the start of `buf`, followed by a zero unit.
    for idx, u in units:
      buf[idx] = cast[Utf16Char](u)
    buf[units.len] = Utf16Char(0)

  # Invalid UTF-16: unpaired surrogates.
  const
    b = "\xEF\xBF\xBD" # UTF-8 bytes of U+FFFD
    c = "\xEF\xBF\xBF" # UTF-8 bytes of U+FFFF

  # A low surrogate on its own.
  wc.setUnits([0xDC00])
  var a = $wc
  doAssert a == b

  # U+FFFF followed by a low surrogate on its own.
  wc.setUnits([0xFFFF, 0xDC00])
  a = $wc
  doAssert a == c & b

  # A high surrogate at the end of the string.
  wc.setUnits([0xD800])
  a = $wc
  doAssert a == b

  # A high surrogate followed by a unit that is not a low surrogate.
  wc.setUnits([0xD800, 0xFFFF])
  a = $wc
  doAssert a == b & c

  echo "OK"