encodings.nim 24 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512
  1. #
  2. #
  3. # Nim's Runtime Library
  4. # (c) Copyright 2015 Andreas Rumpf
  5. #
  6. # See the file "copying.txt", included in this
  7. # distribution, for details about the copyright.
  8. #
  9. ## Routines for converting between different character encodings. On UNIX, this uses
  10. ## the `iconv`:idx: library, on Windows the Windows API.
  11. ##
  12. ## The following example shows how to change character encodings.
  runnableExamples:
    when defined(windows):
      # Source text; the first assertion below pins its UTF-8 byte sequence.
      let orig = "öäüß"
      # "UTF-8" -> "CP1252"
      let cp1252 = convert(orig, "CP1252", "UTF-8")
      # "CP1252" -> "ibm850"
      let ibm850 = convert(cp1252, "ibm850", "CP1252")
      # encoding reported for the current process
      let current = getCurrentEncoding()

      assert orig == "\195\182\195\164\195\188\195\159"
      assert ibm850 == "\148\132\129\225"
      assert convert(ibm850, current, "ibm850") == orig
  ## The example below uses a reusable `EncodingConverter` object which is
  ## created by `open` with `destEncoding` and `srcEncoding` specified. You can use
  ## `convert` on this object multiple times.
  runnableExamples:
    when defined(windows):
      # One converter (GB2312 -> UTF-8) is opened once and reused below.
      var fromGB2312 = open("utf-8", "gb2312")
      # The GB2312 byte pairs "\163\191" and "\163\172" are the full-width
      # punctuation marks "？" and "，", so the expected UTF-8 text must
      # contain the full-width characters, not ASCII '?' and ','.
      let first = "\203\173\197\194\163\191\210\187" &
          "\203\242\209\204\211\234\200\206\198\189\201\250"
      assert fromGB2312.convert(first) == "谁怕？一蓑烟雨任平生"
      let second = "\211\208\176\215\205\183\200\231" &
          "\208\194\163\172\199\227\184\199\200\231\185\202"
      assert fromGB2312.convert(second) == "有白头如新，倾盖如故"
      # Release the converter once it is no longer needed.
      fromGB2312.close()
  37. import std/os
  38. when defined(nimPreviewSlimSystem):
  39. import std/assertions
  when defined(windows):
    type
      CodePage = distinct int32 # Windows code page identifier
      EncodingConverter* = object
        ## Can convert between two character sets.
        dest, src: CodePage
  else:
    type
      ConverterObj = object # field-less stand-in; only used through a pointer
      EncodingConverter* = ptr ConverterObj ## Can convert between two character sets.

  type
    EncodingError* = object of ValueError ## Exception that is raised
                                          ## for encoding errors.
  52. when defined(windows):
  53. import std/[parseutils, strutils]
  proc eqEncodingNames(a, b: string): bool =
    ## Returns true if `a` and `b` denote the same encoding name.
    ##
    ## The comparison ignores ASCII case and every `'-'` and `'_'` separator,
    ## so `"UTF-8"`, `"utf_8"` and `"utf8"` are all considered equal.
    const separators = {'-', '_'}
    var i = 0
    var j = 0
    while true:
      # Skip whole runs of separators, including trailing ones, and never
      # step past the end of either string.
      while i < a.len and a[i] in separators: inc i
      while j < b.len and b[j] in separators: inc j
      if i >= a.len or j >= b.len: break
      if a[i].toLowerAscii != b[j].toLowerAscii:
        return false
      inc i
      inc j
    # Equal only if both names were consumed completely.
    result = i == a.len and j == b.len
  const
    # Known Windows code pages as (number, name) pairs. An empty name means
    # the code page has no textual name in this table.
    winEncodings = [
      (1, "OEMCP"),                        # current OEM code page
      (037, "IBM037"),                     # IBM EBCDIC US-Canada
      (437, "IBM437"),                     # OEM United States
      (500, "IBM500"),                     # IBM EBCDIC International
      (708, "ASMO-708"),                   # Arabic (ASMO 708)
      (709, "ASMO_449"),                   # Arabic (ASMO-449+, BCON V4)
      (710, ""),                           # Arabic - Transparent Arabic
      (720, "DOS-720"),                    # Arabic (Transparent ASMO / DOS)
      (737, "ibm737"),                     # OEM Greek (formerly 437G)
      (775, "ibm775"),                     # OEM Baltic
      (850, "ibm850"),                     # OEM Multilingual Latin 1
      (852, "ibm852"),                     # OEM Latin 2
      (855, "IBM855"),                     # OEM Cyrillic (primarily Russian)
      (857, "ibm857"),                     # OEM Turkish
      (858, "IBM00858"),                   # OEM Multilingual Latin 1 + Euro
      (860, "IBM860"),                     # OEM Portuguese
      (861, "ibm861"),                     # OEM Icelandic
      (862, "DOS-862"),                    # OEM Hebrew
      (863, "IBM863"),                     # OEM French Canadian
      (864, "IBM864"),                     # OEM Arabic
      (865, "IBM865"),                     # OEM Nordic
      (866, "cp866"),                      # OEM Russian; Cyrillic (DOS)
      (869, "ibm869"),                     # OEM Modern Greek
      (870, "IBM870"),                     # IBM EBCDIC Multilingual Latin 2
      (874, "windows-874"),                # ANSI/OEM Thai; Thai (Windows)
      (875, "cp875"),                      # IBM EBCDIC Greek Modern
      (932, "shift_jis"),                  # ANSI/OEM Japanese (Shift-JIS)
      (936, "gb2312"),                     # ANSI/OEM Simplified Chinese
      (936, "gbk"),                        # alias for the GB2312 entry
      (949, "ks_c_5601-1987"),             # ANSI/OEM Korean (Unified Hangul)
      (950, "big5"),                       # ANSI/OEM Traditional Chinese
      (1026, "IBM1026"),                   # IBM EBCDIC Turkish (Latin 5)
      (1047, "IBM01047"),                  # IBM EBCDIC Latin 1/Open System
      (1140, "IBM01140"),                  # EBCDIC US-Canada (037 + Euro)
      (1141, "IBM01141"),                  # EBCDIC Germany (20273 + Euro)
      (1142, "IBM01142"),                  # EBCDIC Denmark-Norway (20277 + Euro)
      (1143, "IBM01143"),                  # EBCDIC Finland-Sweden (20278 + Euro)
      (1144, "IBM01144"),                  # EBCDIC Italy (20280 + Euro)
      (1145, "IBM01145"),                  # EBCDIC Latin America-Spain (20284 + Euro)
      (1146, "IBM01146"),                  # EBCDIC United Kingdom (20285 + Euro)
      (1147, "IBM01147"),                  # EBCDIC France (20297 + Euro)
      (1148, "IBM01148"),                  # EBCDIC International (500 + Euro)
      (1149, "IBM01149"),                  # EBCDIC Icelandic (20871 + Euro)
      (1200, "utf-16"),                    # UTF-16, little endian
      (1201, "unicodeFFFE"),               # UTF-16, big endian
      (1250, "windows-1250"),              # ANSI Central European
      (1251, "windows-1251"),              # ANSI Cyrillic
      (1252, "windows-1252"),              # ANSI Latin 1; Western European
      (1253, "windows-1253"),              # ANSI Greek
      (1254, "windows-1254"),              # ANSI Turkish
      (1255, "windows-1255"),              # ANSI Hebrew
      (1256, "windows-1256"),              # ANSI Arabic
      (1257, "windows-1257"),              # ANSI Baltic
      (1258, "windows-1258"),              # ANSI/OEM Vietnamese
      (1250, "cp-1250"),                   # ANSI Central European
      (1251, "cp-1251"),                   # ANSI Cyrillic
      (1252, "cp-1252"),                   # ANSI Latin 1; Western European
      (1253, "cp-1253"),                   # ANSI Greek
      (1254, "cp-1254"),                   # ANSI Turkish
      (1255, "cp-1255"),                   # ANSI Hebrew
      (1256, "cp-1256"),                   # ANSI Arabic
      (1257, "cp-1257"),                   # ANSI Baltic
      (1258, "cp-1258"),                   # ANSI/OEM Vietnamese
      (1361, "Johab"),                     # Korean (Johab)
      (10000, "macintosh"),                # MAC Roman; Western European (Mac)
      (10001, "x-mac-japanese"),           # Japanese (Mac)
      (10002, "x-mac-chinesetrad"),        # Traditional Chinese (Mac, Big5)
      (10003, "x-mac-korean"),             # Korean (Mac)
      (10004, "x-mac-arabic"),             # Arabic (Mac)
      (10005, "x-mac-hebrew"),             # Hebrew (Mac)
      (10006, "x-mac-greek"),              # Greek (Mac)
      (10007, "x-mac-cyrillic"),           # Cyrillic (Mac)
      (10008, "x-mac-chinesesimp"),        # Simplified Chinese (Mac, GB 2312)
      (10010, "x-mac-romanian"),           # Romanian (Mac)
      (10017, "x-mac-ukrainian"),          # Ukrainian (Mac)
      (10021, "x-mac-thai"),               # Thai (Mac)
      (10029, "x-mac-ce"),                 # MAC Latin 2; Central European (Mac)
      (10079, "x-mac-icelandic"),          # Icelandic (Mac)
      (10081, "x-mac-turkish"),            # Turkish (Mac)
      (10082, "x-mac-croatian"),           # Croatian (Mac)
      (12000, "utf-32"),                   # UTF-32, little endian
      (12001, "utf-32BE"),                 # UTF-32, big endian
      (20000, "x-Chinese_CNS"),            # CNS Taiwan
      (20001, "x-cp20001"),                # TCA Taiwan
      (20002, "x_Chinese-Eten"),           # Eten Taiwan
      (20003, "x-cp20003"),                # IBM5550 Taiwan
      (20004, "x-cp20004"),                # TeleText Taiwan
      (20005, "x-cp20005"),                # Wang Taiwan
      (20105, "x-IA5"),                    # IA5 (IRV, 7-bit); Western European
      (20106, "x-IA5-German"),             # IA5 German (7-bit)
      (20107, "x-IA5-Swedish"),            # IA5 Swedish (7-bit)
      (20108, "x-IA5-Norwegian"),          # IA5 Norwegian (7-bit)
      (20127, "us-ascii"),                 # US-ASCII (7-bit)
      (20261, "x-cp20261"),                # T.61
      (20269, "x-cp20269"),                # ISO 6937 Non-Spacing Accent
      (20273, "IBM273"),                   # IBM EBCDIC Germany
      (20277, "IBM277"),                   # IBM EBCDIC Denmark-Norway
      (20278, "IBM278"),                   # IBM EBCDIC Finland-Sweden
      (20280, "IBM280"),                   # IBM EBCDIC Italy
      (20284, "IBM284"),                   # IBM EBCDIC Latin America-Spain
      (20285, "IBM285"),                   # IBM EBCDIC United Kingdom
      (20290, "IBM290"),                   # IBM EBCDIC Japanese Katakana Extended
      (20297, "IBM297"),                   # IBM EBCDIC France
      (20420, "IBM420"),                   # IBM EBCDIC Arabic
      (20423, "IBM423"),                   # IBM EBCDIC Greek
      (20424, "IBM424"),                   # IBM EBCDIC Hebrew
      (20833, "x-EBCDIC-KoreanExtended"),  # IBM EBCDIC Korean Extended
      (20838, "IBM-Thai"),                 # IBM EBCDIC Thai
      (20866, "koi8-r"),                   # Russian (KOI8-R)
      (20871, "IBM871"),                   # IBM EBCDIC Icelandic
      (20880, "IBM880"),                   # IBM EBCDIC Cyrillic Russian
      (20905, "IBM905"),                   # IBM EBCDIC Turkish
      (20924, "IBM00924"),                 # EBCDIC Latin 1/Open System (1047 + Euro)
      (20932, "EUC-JP"),                   # Japanese (JIS 0208-1990 and 0121-1990)
      (20936, "x-cp20936"),                # Simplified Chinese (GB2312-80)
      (20949, "x-cp20949"),                # Korean Wansung
      (21025, "cp1025"),                   # IBM EBCDIC Cyrillic Serbian-Bulgarian
      (21027, ""),                         # (deprecated)
      (21866, "koi8-u"),                   # Ukrainian (KOI8-U)
      (28591, "iso-8859-1"),               # ISO 8859-1 Latin 1
      (28592, "iso-8859-2"),               # ISO 8859-2 Central European
      (28593, "iso-8859-3"),               # ISO 8859-3 Latin 3
      (28594, "iso-8859-4"),               # ISO 8859-4 Baltic
      (28595, "iso-8859-5"),               # ISO 8859-5 Cyrillic
      (28596, "iso-8859-6"),               # ISO 8859-6 Arabic
      (28597, "iso-8859-7"),               # ISO 8859-7 Greek
      (28598, "iso-8859-8"),               # ISO 8859-8 Hebrew (ISO-Visual)
      (28599, "iso-8859-9"),               # ISO 8859-9 Turkish
      (28603, "iso-8859-13"),              # ISO 8859-13 Estonian
      (28605, "iso-8859-15"),              # ISO 8859-15 Latin 9
      (29001, "x-Europa"),                 # Europa 3
      (38598, "iso-8859-8-i"),             # ISO 8859-8 Hebrew (ISO-Logical)
      (50220, "iso-2022-jp"),              # ISO 2022 Japanese, no halfwidth Katakana
      (50221, "csISO2022JP"),              # ISO 2022 Japanese, halfwidth Katakana
      (50222, "iso-2022-jp"),              # ISO 2022 Japanese JIS X 0201-1989
      (50225, "iso-2022-kr"),              # ISO 2022 Korean
      (50227, "x-cp50227"),                # ISO 2022 Simplified Chinese
      (50229, ""),                         # ISO 2022 Traditional Chinese
      (50930, ""),                         # EBCDIC Japanese (Katakana) Extended
      (50931, ""),                         # EBCDIC US-Canada and Japanese
      (50933, ""),                         # EBCDIC Korean Extended and Korean
      (50935, ""),                         # EBCDIC Simplified Chinese Extended
      (50936, ""),                         # EBCDIC Simplified Chinese
      (50937, ""),                         # EBCDIC US-Canada and Traditional Chinese
      (50939, ""),                         # EBCDIC Japanese (Latin) Extended
      (51932, "euc-jp"),                   # EUC Japanese
      (51936, "EUC-CN"),                   # EUC Simplified Chinese
      (51949, "euc-kr"),                   # EUC Korean
      (51950, ""),                         # EUC Traditional Chinese
      (52936, "hz-gb-2312"),               # HZ-GB2312 Simplified Chinese
      (54936, "GB18030"),                  # GB18030 Simplified Chinese (4 byte)
      (57002, "x-iscii-de"),               # ISCII Devanagari
      (57003, "x-iscii-be"),               # ISCII Bengali
      (57004, "x-iscii-ta"),               # ISCII Tamil
      (57005, "x-iscii-te"),               # ISCII Telugu
      (57006, "x-iscii-as"),               # ISCII Assamese
      (57007, "x-iscii-or"),               # ISCII Oriya
      (57008, "x-iscii-ka"),               # ISCII Kannada
      (57009, "x-iscii-ma"),               # ISCII Malayalam
      (57010, "x-iscii-gu"),               # ISCII Gujarati
      (57011, "x-iscii-pa"),               # ISCII Punjabi
      (65000, "utf-7"),                    # Unicode (UTF-7)
      (65001, "utf-8")]                    # Unicode (UTF-8)
  when false:
    # Disabled: binding for `GetCPInfo`, not needed yet.
    type
      CpInfo = object
        maxCharSize: int32
        defaultChar: array[2, char]
        leadByte: array[12, char]

    proc getCPInfo(codePage: CodePage, lpCPInfo: var CpInfo): int32 {.
      importc: "GetCPInfo", stdcall, dynlib: "kernel32".}
  proc nameToCodePage*(name: string): CodePage =
    ## Returns the code page listed in `winEncodings` for `name`, or
    ## `CodePage(-1)` if no entry matches.
    ##
    ## `name` is either a code page number written in decimal (for example
    ## `"1252"`) or an encoding name, which is compared to the table names
    ## with `eqEncodingNames`. The first matching table entry wins.
    var nameAsInt = -1
    try:
      var parsed: int
      # Treat `name` as a number only if the whole string was consumed, so
      # that input such as "1252abc" is not mistaken for code page 1252.
      if name.len > 0 and parseInt(name, parsed) == name.len:
        nameAsInt = parsed
    except ValueError:
      # Digits that do not fit into an `int` cannot be a code page number;
      # fall through to the name comparison instead of leaking the error.
      nameAsInt = -1
    for no, na in items(winEncodings):
      if no == nameAsInt:
        return CodePage(no)
      # Entries with an empty name have no textual alias; they must never
      # match, otherwise an empty `name` would resolve to code page 710.
      if na.len > 0 and eqEncodingNames(na, name):
        return CodePage(no)
    result = CodePage(-1)
  proc codePageToName*(c: CodePage): string =
    ## Returns the name of the first `winEncodings` entry whose number equals
    ## `c`. Entries without a name yield the number as a decimal string. If
    ## `c` is not in the table, the empty string is returned.
    let wanted = int(c)
    for entry in winEncodings:
      if entry[0] == wanted:
        if entry[1].len != 0:
          return entry[1]
        return $entry[0]
    return ""
  # Bindings to the kernel32 code page functions used below.

  # `GetACP`: used by `getCurrentEncoding` when `uiApp` is true.
  proc getACP(): CodePage {.
    importc: "GetACP", stdcall, dynlib: "kernel32".}

  # `GetConsoleCP`: used by `getCurrentEncoding` when `uiApp` is false.
  proc getGetConsoleCP(): CodePage {.
    importc: "GetConsoleCP", stdcall, dynlib: "kernel32".}

  # `MultiByteToWideChar`: the wide-character buffer is passed as a `cstring`
  # pointing at raw bytes; `cchWideChar` is its capacity in wide characters.
  proc multiByteToWideChar(
      codePage: CodePage,
      dwFlags: int32,
      lpMultiByteStr: cstring,
      cbMultiByte: cint,
      lpWideCharStr: cstring,
      cchWideChar: cint): cint {.
    importc: "MultiByteToWideChar", stdcall, dynlib: "kernel32".}

  # `WideCharToMultiByte`: the two trailing parameters default to `nil`.
  proc wideCharToMultiByte(
      codePage: CodePage,
      dwFlags: int32,
      lpWideCharStr: cstring,
      cchWideChar: cint,
      lpMultiByteStr: cstring,
      cbMultiByte: cint,
      lpDefaultChar: cstring = nil,
      lpUsedDefaultChar: pointer = nil): cint {.
    importc: "WideCharToMultiByte", stdcall, dynlib: "kernel32".}
  272. else:
  # Library that the iconv bindings are loaded from when `dynlib` is used.
  const iconvDll =
    when defined(haiku): "libiconv.so"
    elif defined(macosx): "libiconv.dylib"
    else: "(libc.so.6|libiconv.so)"

  # `errno` values that `convert` checks after a failed `iconv` call.
  const
    E2BIG = cint(7)
    EINVAL = cint(22)

  # `EILSEQ` differs per platform; it is only defined for the systems below.
  when defined(linux):
    const EILSEQ = cint(84)
  elif defined(macosx):
    const EILSEQ = cint(92)
  elif defined(bsd):
    const EILSEQ = cint(86)
  elif defined(solaris):
    const EILSEQ = cint(88)
  elif defined(haiku):
    const EILSEQ = cint(-2147454938)

  var errno {.importc, header: "<errno.h>".}: cint

  # `importIconv` selects how the iconv procs are bound: through the
  # `<iconv.h>` header on BSD (OpenBSD additionally links `-liconv`),
  # through `iconvDll` everywhere else.
  when defined(bsd):
    {.pragma: importIconv, cdecl, header: "<iconv.h>".}
    when defined(openbsd):
      {.passL: "-liconv".}
  else:
    {.pragma: importIconv, cdecl, dynlib: iconvDll.}
  # Bindings to the iconv API, imported according to `importIconv`.

  # `iconv_open`: `open` treats a result of `-1` as failure.
  proc iconvOpen(tocode, fromcode: cstring): EncodingConverter {.
    importIconv, importc: "iconv_open".}

  # `iconv_close`: called by `close`.
  proc iconvClose(c: EncodingConverter) {.
    importIconv, importc: "iconv_close".}

  # `iconv`: `convert` treats a result of `high(csize_t)` as failure and then
  # inspects `errno`.
  proc iconv(c: EncodingConverter,
             inbuf: ptr cstring, inbytesLeft: ptr csize_t,
             outbuf: ptr cstring, outbytesLeft: ptr csize_t): csize_t {.
    importIconv, importc: "iconv".}
  proc getCurrentEncoding*(uiApp = false): string =
    ## Retrieves the current encoding. On Unix, "UTF-8" is always returned.
    ## The `uiApp` parameter is Windows specific: if it is true, the name of
    ## the UI's code page is returned, otherwise the name of the console's
    ## code page.
    when not defined(windows):
      return "UTF-8"
    else:
      let page = if uiApp: getACP() else: getGetConsoleCP()
      return codePageToName(page)
  proc open*(destEncoding = "UTF-8",
             srcEncoding = "CP1252"): EncodingConverter =
    ## Opens a converter that can convert from `srcEncoding` to `destEncoding`.
    ## Raises `EncodingError` if it cannot fulfill the request.
    when defined(windows):
      # Resolve both names first, then report the destination before the
      # source if either one is unknown.
      result = EncodingConverter(dest: nameToCodePage(destEncoding),
                                 src: nameToCodePage(srcEncoding))
      if int(result.dest) == -1:
        raise newException(EncodingError,
          "cannot find encoding " & destEncoding)
      if int(result.src) == -1:
        raise newException(EncodingError,
          "cannot find encoding " & srcEncoding)
    else:
      result = iconvOpen(destEncoding, srcEncoding)
      # `iconvOpen` signals failure with the pointer value -1.
      let failed = cast[EncodingConverter](-1)
      if result == failed:
        raise newException(EncodingError,
          "cannot create encoding converter from " &
          srcEncoding & " to " & destEncoding)
  proc close*(c: EncodingConverter) =
    ## Frees the resources the converter `c` holds.
    when defined(windows):
      discard # nothing to release: the converter only stores two code pages
    else:
      iconvClose(c)
  336. when defined(windows):
  proc convertToWideString(codePage: CodePage, s: string): string =
    ## Decodes `s` from `codePage` with `MultiByteToWideChar` and returns the
    ## resulting UTF-16 LE data as raw bytes (two bytes per wide character).
    ## Raises `OSError` if the conversion still fails after the buffer has
    ## been sized to what the API reports.
    # educated guess of capacity, counted in wide characters:
    var wideCap = s.len + (s.len shr 2)
    result = newString(wideCap * 2)
    var written = multiByteToWideChar(codePage, 0'i32,
                                      cstring(s), cint(s.len),
                                      cstring(result), cint(wideCap))
    if written > wideCap:
      assert(false) # cannot happen
    elif written != 0:
      setLen(result, written * 2)
    else:
      # The guess did not work out; ask the API for the capacity ...
      wideCap = multiByteToWideChar(codePage, 0'i32,
                                    cstring(s), cint(s.len),
                                    nil, cint(0))
      # ... and do the conversion properly:
      result = newString(wideCap * 2)
      written = multiByteToWideChar(codePage, 0'i32,
                                    cstring(s), cint(s.len),
                                    cstring(result), cint(wideCap))
      if written == 0: raiseOSError(osLastError())
      setLen(result, written * 2)
  proc convertFromWideString(codePage: CodePage, s: string): string =
    ## Encodes `s`, which holds wide characters as raw bytes (two bytes per
    ## character), into `codePage` with `WideCharToMultiByte`.
    ## Raises `OSError` if the conversion still fails after the buffer has
    ## been sized to what the API reports.
    let wideLen = s.len div 2
    # first guess for the output size in bytes:
    var byteCap = s.len + (s.len shr 2)
    result = newString(byteCap)
    var written = wideCharToMultiByte(codePage, 0'i32,
                                      cstring(s), cint(wideLen),
                                      cstring(result), byteCap.cint)
    if written > byteCap:
      assert(false) # cannot happen
    elif written != 0:
      setLen(result, written)
    else:
      # The guess did not work out; ask the API for the capacity ...
      byteCap = wideCharToMultiByte(codePage, 0'i32,
                                    cstring(s), cint(wideLen),
                                    nil, cint(0))
      # ... and do the conversion properly:
      result = newString(byteCap)
      written = wideCharToMultiByte(codePage, 0'i32,
                                    cstring(s), cint(wideLen),
                                    cstring(result), byteCap.cint)
      if written == 0: raiseOSError(osLastError())
      setLen(result, written)
  proc convertWin(codePageFrom: CodePage, codePageTo: CodePage,
                  s: string): string =
    ## Converts `s` from `codePageFrom` to `codePageTo`, going through UTF-16
    ## little endian (code page 1200) as the intermediate form.
    ## Raises `EncodingError` for the code pages listed in `unsupported`.
    # The empty string is handled up front because MultiByteToWideChar and
    # WideCharToMultiByte return 0 in case of error.
    if s.len == 0:
      return ""

    # code pages that this routine refuses to convert from or to
    let unsupported = [1201, 12000, 12001]
    if int(codePageFrom) in unsupported:
      raise newException(EncodingError,
        "encoding from " & codePageToName(codePageFrom) &
        " is not supported on windows")
    if int(codePageTo) in unsupported:
      raise newException(EncodingError,
        "encoding to " & codePageToName(codePageTo) &
        " is not supported on windows")

    # Input or output that already is UTF-16 little endian skips the
    # corresponding conversion step.
    var wide: string
    if int(codePageFrom) == 1200:
      wide = s
    else:
      wide = convertToWideString(codePageFrom, s)
    if int(codePageTo) == 1200:
      result = wide
    else:
      result = convertFromWideString(codePageTo, wide)
  proc convert*(c: EncodingConverter, s: string): string =
    ## Converts `s` from the source code page stored in `c` to its
    ## destination code page.
    convertWin(c.src, c.dest, s)
  422. else:
  proc convert*(c: EncodingConverter, s: string): string =
    ## Converts `s` to `destEncoding` that was given to the converter `c`. It
    ## assumes that `s` is in `srcEncoding`.
    ##
    ## Input bytes for which `iconv` reports `EILSEQ` or `EINVAL` are copied
    ## to the output unchanged, one byte at a time. Any `iconv` error other
    ## than these and `E2BIG` raises `OSError`.
    ##
    ## .. warning:: UTF-16BE and UTF-32 conversions are not supported on Windows.
    result = newString(s.len)
    var inLen = csize_t len(s)
    var outLen = csize_t len(result)
    var src = cstring(s)
    var dst = cstring(result)
    var iconvres: csize_t

    template growOutput() =
      # Enlarges `result` and re-derives `dst` and `outLen`. `setLen` may
      # move the string's storage, so `dst` is rebuilt from the number of
      # bytes written so far (`len(result) - outLen`).
      let written = len(result) - outLen.int
      setLen(result, len(result) + inLen.int * 2 + 5)
      # 5 is minimally one utf-8 char
      dst = cast[cstring](cast[int](cstring(result)) + written)
      outLen = csize_t(len(result) - written)

    while inLen > 0:
      iconvres = iconv(c, addr src, addr inLen, addr dst, addr outLen)
      if iconvres == high(csize_t):
        let lerr = errno
        if lerr == EILSEQ or lerr == EINVAL:
          # unknown char, skip: the byte is passed through verbatim.
          # Make room first; with a full buffer the write would land past
          # the end of `result` and the unsigned `outLen` would wrap around.
          if outLen == 0:
            growOutput()
          dst[0] = src[0]
          src = cast[cstring](cast[int](src) + 1)
          dst = cast[cstring](cast[int](dst) + 1)
          dec(inLen)
          dec(outLen)
        elif lerr == E2BIG:
          growOutput()
        else:
          raiseOSError(lerr.OSErrorCode)
    # iconv has a buffer that needs flushing, specially if the last char is
    # not '\0'. Keep the result of the flush itself: the retry below must
    # react to this call, not to the last call made inside the loop.
    iconvres = iconv(c, nil, nil, addr dst, addr outLen)
    if iconvres == high(csize_t) and errno == E2BIG:
      growOutput()
      discard iconv(c, nil, nil, addr dst, addr outLen)
    # trim output buffer
    setLen(result, len(result) - outLen.int)
  proc convert*(s: string, destEncoding = "UTF-8",
                srcEncoding = "CP1252"): string =
    ## Converts `s` to `destEncoding`. It assumes that `s` is in `srcEncoding`.
    ## A converter is opened, used once and closed again, which is more
    ## convenient but also likely less efficient than re-using a converter.
    ##
    ## .. warning:: UTF-16BE and UTF-32 conversions are not supported on Windows.
    let conv = open(destEncoding, srcEncoding)
    # close the converter even if the conversion raises
    defer: close(conv)
    result = convert(conv, s)