encodings.nim 24 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510
  1. #
  2. #
  3. # Nim's Runtime Library
  4. # (c) Copyright 2015 Andreas Rumpf
  5. #
  6. # See the file "copying.txt", included in this
  7. # distribution, for details about the copyright.
  8. #
  9. ## Routines for converting between different character encodings. On UNIX, this uses
  10. ## the `iconv`:idx: library, on Windows the Windows API.
  11. ##
  12. ## The following example shows how to change character encodings.
  runnableExamples:
    when defined(windows):
      let
        orig = "öäüß"
        # convert `orig` from "UTF-8" to "CP1252"
        cp1252 = convert(orig, "CP1252", "UTF-8")
        # convert `cp1252` from "CP1252" to "ibm850"
        ibm850 = convert(cp1252, "ibm850", "CP1252")
        # name of the console's current code page
        current = getCurrentEncoding()
      assert orig == "\195\182\195\164\195\188\195\159"
      assert ibm850 == "\148\132\129\225"
      # converting back to "UTF-8" always restores the original bytes
      assert convert(ibm850, "UTF-8", "ibm850") == orig
      # converting to the console's code page yields the UTF-8 bytes of `orig`
      # only when that code page is UTF-8 itself
      if current == "utf-8":
        assert convert(ibm850, current, "ibm850") == orig
  25. ## The example below uses a reusable `EncodingConverter` object which is
  26. ## created by `open` with `destEncoding` and `srcEncoding` specified. You can use
  27. ## `convert` on this object multiple times.
  runnableExamples:
    when defined(windows):
      # one converter (GB2312 -> UTF-8) is opened once and used twice
      let fromGB2312 = open("utf-8", "gb2312")
      let first = "\203\173\197\194\163\191\210\187" &
          "\203\242\209\204\211\234\200\206\198\189\201\250"
      # `\163\191` is the fullwidth question mark, not ASCII `?`
      assert fromGB2312.convert(first) == "谁怕？一蓑烟雨任平生"
      let second = "\211\208\176\215\205\183\200\231" &
          "\208\194\163\172\199\227\184\199\200\231\185\202"
      # `\163\172` is the fullwidth comma, not ASCII `,`
      assert fromGB2312.convert(second) == "有白头如新，倾盖如故"
  37. import os
  when defined(windows):
    type
      CodePage = distinct int32 ## Numeric code page identifier.
      EncodingConverter* = object ## Can convert between two character sets.
        dest, src: CodePage ## Target and source code page of the conversion.
  else:
    type
      ConverterObj = object ## Empty placeholder that `EncodingConverter`
                            ## points at.
      EncodingConverter* = ptr ConverterObj ## Can convert between two
                                            ## character sets.

  type
    EncodingError* = object of ValueError ## Exception that is raised
                                          ## for encoding errors.
  50. when defined(windows):
  51. import parseutils, strutils
  proc eqEncodingNames(a, b: string): bool =
    ## Compares two encoding names case-insensitively (ASCII only) while
    ## tolerating a single `-` or `_` in front of a character on either
    ## side, so that e.g. `"utf-8"` and `"UTF8"` are considered equal.
    const separators = {'-', '_'}
    var
      posA = 0
      posB = 0
    while posA < a.len and posB < b.len:
      # skip at most one separator per side and per round
      if a[posA] in separators: inc posA
      if b[posB] in separators: inc posB
      # the skips above may have run past the end of either name
      if posA < a.len and posB < b.len:
        if toLowerAscii(a[posA]) != toLowerAscii(b[posB]):
          return false
      inc posA
      inc posB
    # equal only if both names were consumed exactly
    result = posA == a.len and posB == b.len
  const
    # Table of `(code page number, encoding name)` pairs.
    # An empty name means the code page has no name in this table;
    # `codePageToName` then falls back to the number itself.
    # Several names may share one number (aliases such as "gb2312"/"gbk").
    winEncodings = [
      (1, "OEMCP"), # current OEM codepage
      (37, "IBM037"), # IBM EBCDIC US-Canada
      (437, "IBM437"), # OEM United States
      (500, "IBM500"), # IBM EBCDIC International
      (708, "ASMO-708"), # Arabic (ASMO 708)
      (709, "ASMO_449"), # Arabic (ASMO-449+, BCON V4)
      (710, ""), # Arabic - Transparent Arabic
      (720, "DOS-720"), # Arabic (Transparent ASMO); Arabic (DOS)
      (737, "ibm737"), # OEM Greek (formerly 437G); Greek (DOS)
      (775, "ibm775"), # OEM Baltic; Baltic (DOS)
      (850, "ibm850"), # OEM Multilingual Latin 1; Western European (DOS)
      (852, "ibm852"), # OEM Latin 2; Central European (DOS)
      (855, "IBM855"), # OEM Cyrillic (primarily Russian)
      (857, "ibm857"), # OEM Turkish; Turkish (DOS)
      (858, "IBM00858"), # OEM Multilingual Latin 1 + Euro symbol
      (860, "IBM860"), # OEM Portuguese; Portuguese (DOS)
      (861, "ibm861"), # OEM Icelandic; Icelandic (DOS)
      (862, "DOS-862"), # OEM Hebrew; Hebrew (DOS)
      (863, "IBM863"), # OEM French Canadian; French Canadian (DOS)
      (864, "IBM864"), # OEM Arabic; Arabic (864)
      (865, "IBM865"), # OEM Nordic; Nordic (DOS)
      (866, "cp866"), # OEM Russian; Cyrillic (DOS)
      (869, "ibm869"), # OEM Modern Greek; Greek, Modern (DOS)
      (870, "IBM870"), # IBM EBCDIC Multilingual/ROECE (Latin 2); IBM EBCDIC Multilingual Latin 2
      (874, "windows-874"), # ANSI/OEM Thai; Thai (Windows)
      (875, "cp875"), # IBM EBCDIC Greek Modern
      (932, "shift_jis"), # ANSI/OEM Japanese; Japanese (Shift-JIS)
      (936, "gb2312"), # ANSI/OEM Simplified Chinese (PRC, Singapore); Chinese Simplified (GB2312)
      (936, "gbk"), # Alias for GB2312 encoding
      (949, "ks_c_5601-1987"), # ANSI/OEM Korean (Unified Hangul Code)
      (950, "big5"), # ANSI/OEM Traditional Chinese (Taiwan; Hong Kong SAR, PRC); Chinese Traditional (Big5)
      (1026, "IBM1026"), # IBM EBCDIC Turkish (Latin 5)
      (1047, "IBM01047"), # IBM EBCDIC Latin 1/Open System
      (1140, "IBM01140"), # IBM EBCDIC US-Canada (037 + Euro symbol); IBM EBCDIC (US-Canada-Euro)
      (1141, "IBM01141"), # IBM EBCDIC Germany (20273 + Euro symbol); IBM EBCDIC (Germany-Euro)
      (1142, "IBM01142"), # IBM EBCDIC Denmark-Norway (20277 + Euro symbol); IBM EBCDIC (Denmark-Norway-Euro)
      (1143, "IBM01143"), # IBM EBCDIC Finland-Sweden (20278 + Euro symbol); IBM EBCDIC (Finland-Sweden-Euro)
      (1144, "IBM01144"), # IBM EBCDIC Italy (20280 + Euro symbol); IBM EBCDIC (Italy-Euro)
      (1145, "IBM01145"), # IBM EBCDIC Latin America-Spain (20284 + Euro symbol); IBM EBCDIC (Spain-Euro)
      (1146, "IBM01146"), # IBM EBCDIC United Kingdom (20285 + Euro symbol); IBM EBCDIC (UK-Euro)
      (1147, "IBM01147"), # IBM EBCDIC France (20297 + Euro symbol); IBM EBCDIC (France-Euro)
      (1148, "IBM01148"), # IBM EBCDIC International (500 + Euro symbol); IBM EBCDIC (International-Euro)
      (1149, "IBM01149"), # IBM EBCDIC Icelandic (20871 + Euro symbol); IBM EBCDIC (Icelandic-Euro)
      (1200, "utf-16"), # Unicode UTF-16, little endian byte order (BMP of ISO 10646); available only to managed applications
      (1201, "unicodeFFFE"), # Unicode UTF-16, big endian byte order; available only to managed applications
      (1250, "windows-1250"), # ANSI Central European; Central European (Windows)
      (1251, "windows-1251"), # ANSI Cyrillic; Cyrillic (Windows)
      (1252, "windows-1252"), # ANSI Latin 1; Western European (Windows)
      (1253, "windows-1253"), # ANSI Greek; Greek (Windows)
      (1254, "windows-1254"), # ANSI Turkish; Turkish (Windows)
      (1255, "windows-1255"), # ANSI Hebrew; Hebrew (Windows)
      (1256, "windows-1256"), # ANSI Arabic; Arabic (Windows)
      (1257, "windows-1257"), # ANSI Baltic; Baltic (Windows)
      (1258, "windows-1258"), # ANSI/OEM Vietnamese; Vietnamese (Windows)
      (1250, "cp-1250"), # ANSI Central European; Central European (Windows)
      (1251, "cp-1251"), # ANSI Cyrillic; Cyrillic (Windows)
      (1252, "cp-1252"), # ANSI Latin 1; Western European (Windows)
      (1253, "cp-1253"), # ANSI Greek; Greek (Windows)
      (1254, "cp-1254"), # ANSI Turkish; Turkish (Windows)
      (1255, "cp-1255"), # ANSI Hebrew; Hebrew (Windows)
      (1256, "cp-1256"), # ANSI Arabic; Arabic (Windows)
      (1257, "cp-1257"), # ANSI Baltic; Baltic (Windows)
      (1258, "cp-1258"), # ANSI/OEM Vietnamese; Vietnamese (Windows)
      (1361, "Johab"), # Korean (Johab)
      (10000, "macintosh"), # MAC Roman; Western European (Mac)
      (10001, "x-mac-japanese"), # Japanese (Mac)
      (10002, "x-mac-chinesetrad"), # MAC Traditional Chinese (Big5); Chinese Traditional (Mac)
      (10003, "x-mac-korean"), # Korean (Mac)
      (10004, "x-mac-arabic"), # Arabic (Mac)
      (10005, "x-mac-hebrew"), # Hebrew (Mac)
      (10006, "x-mac-greek"), # Greek (Mac)
      (10007, "x-mac-cyrillic"), # Cyrillic (Mac)
      (10008, "x-mac-chinesesimp"), # MAC Simplified Chinese (GB 2312); Chinese Simplified (Mac)
      (10010, "x-mac-romanian"), # Romanian (Mac)
      (10017, "x-mac-ukrainian"), # Ukrainian (Mac)
      (10021, "x-mac-thai"), # Thai (Mac)
      (10029, "x-mac-ce"), # MAC Latin 2; Central European (Mac)
      (10079, "x-mac-icelandic"), # Icelandic (Mac)
      (10081, "x-mac-turkish"), # Turkish (Mac)
      (10082, "x-mac-croatian"), # Croatian (Mac)
      (12000, "utf-32"), # Unicode UTF-32, little endian byte order; available only to managed applications
      (12001, "utf-32BE"), # Unicode UTF-32, big endian byte order; available only to managed applications
      (20000, "x-Chinese_CNS"), # CNS Taiwan; Chinese Traditional (CNS)
      (20001, "x-cp20001"), # TCA Taiwan
      (20002, "x_Chinese-Eten"), # Eten Taiwan; Chinese Traditional (Eten)
      (20003, "x-cp20003"), # IBM5550 Taiwan
      (20004, "x-cp20004"), # TeleText Taiwan
      (20005, "x-cp20005"), # Wang Taiwan
      (20105, "x-IA5"), # IA5 (IRV International Alphabet No. 5, 7-bit); Western European (IA5)
      (20106, "x-IA5-German"), # IA5 German (7-bit)
      (20107, "x-IA5-Swedish"), # IA5 Swedish (7-bit)
      (20108, "x-IA5-Norwegian"), # IA5 Norwegian (7-bit)
      (20127, "us-ascii"), # US-ASCII (7-bit)
      (20261, "x-cp20261"), # T.61
      (20269, "x-cp20269"), # ISO 6937 Non-Spacing Accent
      (20273, "IBM273"), # IBM EBCDIC Germany
      (20277, "IBM277"), # IBM EBCDIC Denmark-Norway
      (20278, "IBM278"), # IBM EBCDIC Finland-Sweden
      (20280, "IBM280"), # IBM EBCDIC Italy
      (20284, "IBM284"), # IBM EBCDIC Latin America-Spain
      (20285, "IBM285"), # IBM EBCDIC United Kingdom
      (20290, "IBM290"), # IBM EBCDIC Japanese Katakana Extended
      (20297, "IBM297"), # IBM EBCDIC France
      (20420, "IBM420"), # IBM EBCDIC Arabic
      (20423, "IBM423"), # IBM EBCDIC Greek
      (20424, "IBM424"), # IBM EBCDIC Hebrew
      (20833, "x-EBCDIC-KoreanExtended"), # IBM EBCDIC Korean Extended
      (20838, "IBM-Thai"), # IBM EBCDIC Thai
      (20866, "koi8-r"), # Russian (KOI8-R); Cyrillic (KOI8-R)
      (20871, "IBM871"), # IBM EBCDIC Icelandic
      (20880, "IBM880"), # IBM EBCDIC Cyrillic Russian
      (20905, "IBM905"), # IBM EBCDIC Turkish
      (20924, "IBM00924"), # IBM EBCDIC Latin 1/Open System (1047 + Euro symbol)
      (20932, "EUC-JP"), # Japanese (JIS 0208-1990 and 0121-1990)
      (20936, "x-cp20936"), # Simplified Chinese (GB2312); Chinese Simplified (GB2312-80)
      (20949, "x-cp20949"), # Korean Wansung
      (21025, "cp1025"), # IBM EBCDIC Cyrillic Serbian-Bulgarian
      (21027, ""), # (deprecated)
      (21866, "koi8-u"), # Ukrainian (KOI8-U); Cyrillic (KOI8-U)
      (28591, "iso-8859-1"), # ISO 8859-1 Latin 1; Western European (ISO)
      (28592, "iso-8859-2"), # ISO 8859-2 Central European; Central European (ISO)
      (28593, "iso-8859-3"), # ISO 8859-3 Latin 3
      (28594, "iso-8859-4"), # ISO 8859-4 Baltic
      (28595, "iso-8859-5"), # ISO 8859-5 Cyrillic
      (28596, "iso-8859-6"), # ISO 8859-6 Arabic
      (28597, "iso-8859-7"), # ISO 8859-7 Greek
      (28598, "iso-8859-8"), # ISO 8859-8 Hebrew; Hebrew (ISO-Visual)
      (28599, "iso-8859-9"), # ISO 8859-9 Turkish
      (28603, "iso-8859-13"), # ISO 8859-13 Estonian
      (28605, "iso-8859-15"), # ISO 8859-15 Latin 9
      (29001, "x-Europa"), # Europa 3
      (38598, "iso-8859-8-i"), # ISO 8859-8 Hebrew; Hebrew (ISO-Logical)
      (50220, "iso-2022-jp"), # ISO 2022 Japanese with no halfwidth Katakana; Japanese (JIS)
      (50221, "csISO2022JP"), # ISO 2022 Japanese with halfwidth Katakana; Japanese (JIS-Allow 1 byte Kana)
      (50222, "iso-2022-jp"), # ISO 2022 Japanese JIS X 0201-1989; Japanese (JIS-Allow 1 byte Kana - SO/SI)
      (50225, "iso-2022-kr"), # ISO 2022 Korean
      (50227, "x-cp50227"), # ISO 2022 Simplified Chinese; Chinese Simplified (ISO 2022)
      (50229, ""), # ISO 2022 Traditional Chinese
      (50930, ""), # EBCDIC Japanese (Katakana) Extended
      (50931, ""), # EBCDIC US-Canada and Japanese
      (50933, ""), # EBCDIC Korean Extended and Korean
      (50935, ""), # EBCDIC Simplified Chinese Extended and Simplified Chinese
      (50936, ""), # EBCDIC Simplified Chinese
      (50937, ""), # EBCDIC US-Canada and Traditional Chinese
      (50939, ""), # EBCDIC Japanese (Latin) Extended and Japanese
      (51932, "euc-jp"), # EUC Japanese
      (51936, "EUC-CN"), # EUC Simplified Chinese; Chinese Simplified (EUC)
      (51949, "euc-kr"), # EUC Korean
      (51950, ""), # EUC Traditional Chinese
      (52936, "hz-gb-2312"), # HZ-GB2312 Simplified Chinese; Chinese Simplified (HZ)
      (54936, "GB18030"), # Windows XP and later: GB18030 Simplified Chinese (4 byte); Chinese Simplified (GB18030)
      (57002, "x-iscii-de"), # ISCII Devanagari
      (57003, "x-iscii-be"), # ISCII Bengali
      (57004, "x-iscii-ta"), # ISCII Tamil
      (57005, "x-iscii-te"), # ISCII Telugu
      (57006, "x-iscii-as"), # ISCII Assamese
      (57007, "x-iscii-or"), # ISCII Oriya
      (57008, "x-iscii-ka"), # ISCII Kannada
      (57009, "x-iscii-ma"), # ISCII Malayalam
      (57010, "x-iscii-gu"), # ISCII Gujarati
      (57011, "x-iscii-pa"), # ISCII Punjabi
      (65000, "utf-7"), # Unicode (UTF-7)
      (65001, "utf-8")] # Unicode (UTF-8)
  when false:
    # not needed yet: binding for `GetCPInfo`, kept for future use and
    # excluded from compilation by the `when false` guard.
    type
      CpInfo = object
        maxCharSize: int32
        defaultChar: array[2, char]
        leadByte: array[12, char]

    proc getCPInfo(codePage: CodePage, lpCPInfo: var CpInfo): int32 {.
      importc: "GetCPInfo", stdcall, dynlib: "kernel32".}
  proc nameToCodePage*(name: string): CodePage =
    ## Looks up the code page for `name` in `winEncodings`.
    ##
    ## `name` is either an encoding name (compared with `eqEncodingNames`,
    ## i.e. ignoring case and `-`/`_` separators) or starts with the code
    ## page number written in decimal. The first matching table entry wins.
    ## Returns `CodePage(-1)` if nothing matches.
    var nameAsInt: int
    # `parseInt` yields the number of characters it consumed; 0 means that
    # `name` does not start with a number, so no numeric match is possible.
    if parseInt(name, nameAsInt) == 0: nameAsInt = -1
    for no, na in items(winEncodings):
      # Entries with an empty name can only be addressed by number. Without
      # the `na.len != 0` guard an empty `name` would compare equal to the
      # first nameless entry instead of being reported as unknown.
      if no == nameAsInt or (na.len != 0 and eqEncodingNames(na, name)):
        return CodePage(no)
    result = CodePage(-1)
  proc codePageToName*(c: CodePage): string =
    ## Returns the name of the first `winEncodings` entry whose number is
    ## `c`. If that entry has no name, the number itself is returned as a
    ## string. Returns `""` when `c` is not in the table.
    result = ""
    let wanted = int(c)
    for id, label in items(winEncodings):
      if id == wanted:
        if label.len != 0:
          result = label
        else:
          result = $id
        break
  # Binding for `GetACP` from kernel32; `getCurrentEncoding` uses it to
  # obtain the UI's code page.
  proc getACP(): CodePage {.
    importc: "GetACP", stdcall, dynlib: "kernel32".}
  # Binding for `GetConsoleCP` from kernel32; `getCurrentEncoding` uses it to
  # obtain the console's code page.
  proc getGetConsoleCP(): CodePage {.
    importc: "GetConsoleCP", stdcall, dynlib: "kernel32".}
  # Binding for `MultiByteToWideChar` from kernel32. The callers in this
  # module pass the UTF-16 output buffer as a byte string, ask for the
  # required capacity by passing `nil`/0 as the output buffer, and treat a
  # return value of 0 as an error.
  proc multiByteToWideChar(codePage: CodePage, dwFlags: int32,
                           lpMultiByteStr: cstring, cbMultiByte: cint,
                           lpWideCharStr: cstring, cchWideChar: cint): cint {.
    importc: "MultiByteToWideChar", stdcall, dynlib: "kernel32".}
  # Binding for `WideCharToMultiByte` from kernel32. The callers in this
  # module pass the UTF-16 input as a byte string, ask for the required
  # capacity by passing `nil`/0 as the output buffer, and treat a return
  # value of 0 as an error. The two trailing parameters default to `nil`.
  proc wideCharToMultiByte(codePage: CodePage, dwFlags: int32,
                           lpWideCharStr: cstring, cchWideChar: cint,
                           lpMultiByteStr: cstring, cbMultiByte: cint,
                           lpDefaultChar: cstring = nil,
                           lpUsedDefaultChar: pointer = nil): cint {.
    importc: "WideCharToMultiByte", stdcall, dynlib: "kernel32".}
  270. else:
  # Name (or name pattern) of the dynamic library that provides iconv.
  const iconvDll =
    when defined(haiku): "libiconv.so"
    elif defined(macosx): "libiconv.dylib"
    else: "(libc.so.6|libiconv.so)"
  # `errno` values that `convert` reacts to after a failed `iconv` call.
  # NOTE(review): E2BIG and EINVAL are hard-coded to the same value on every
  # platform, while EILSEQ below is strongly platform dependent (negative on
  # Haiku) -- confirm that 7 and 22 match <errno.h> on all supported targets.
  const
    E2BIG = 7.cint
    EINVAL = 22.cint
  when defined(linux):
    const EILSEQ = 84.cint
  elif defined(macosx):
    const EILSEQ = 92.cint
  elif defined(bsd):
    const EILSEQ = 86.cint
  elif defined(solaris):
    const EILSEQ = 88.cint
  elif defined(haiku):
    const EILSEQ = -2147454938.cint
  else:
    # Any other platform: take the value from the C header instead of leaving
    # `EILSEQ` undeclared, which would break the compilation of `convert`.
    var EILSEQ {.importc: "EILSEQ", header: "<errno.h>".}: cint
  # C's `errno`, read by `convert` after a failed `iconv` call.
  var errno {.importc, header: "<errno.h>".}: cint

  # `importIconv` bundles the pragmas shared by all iconv bindings: on BSD
  # the functions come from <iconv.h> (OpenBSD additionally links against
  # libiconv), everywhere else they are loaded from `iconvDll`.
  when not defined(bsd):
    {.pragma: importIconv, cdecl, dynlib: iconvDll.}
  else:
    {.pragma: importIconv, cdecl, header: "<iconv.h>".}
    when defined(openbsd):
      {.passL: "-liconv".}
  # Binding for C's `iconv_open`; yields the converter handle used by
  # `iconv` and `iconvClose`.
  proc iconvOpen(tocode: cstring, fromcode: cstring): EncodingConverter {.
    importIconv, importc: "iconv_open".}
  # Binding for C's `iconv_close`; called by `close` to release a converter.
  proc iconvClose(c: EncodingConverter) {.
    importIconv, importc: "iconv_close".}
  # Binding for C's `iconv`. `convert` advances through the input and output
  # buffers via the pointer/length pairs, treats a return value of
  # `high(csize_t)` as failure (details in `errno`) and passes `nil` for the
  # input pair to flush the converter.
  proc iconv(c: EncodingConverter,
             inbuf: ptr cstring, inbytesLeft: ptr csize_t,
             outbuf: ptr cstring, outbytesLeft: ptr csize_t): csize_t {.
    importIconv, importc: "iconv".}
  proc getCurrentEncoding*(uiApp = false): string =
    ## Returns the name of the current encoding.
    ##
    ## On Unix this is always "UTF-8". On Windows the answer depends on
    ## `uiApp`: if true, the UI's code-page is returned, otherwise the
    ## Console's code-page. `uiApp` is ignored on other platforms.
    when not defined(windows):
      result = "UTF-8"
    else:
      let page = if uiApp: getACP() else: getGetConsoleCP()
      result = codePageToName(page)
  proc open*(destEncoding = "UTF-8", srcEncoding = "CP1252"): EncodingConverter =
    ## Opens a converter that can convert from `srcEncoding` to `destEncoding`.
    ##
    ## Raises `EncodingError` if it cannot fulfill the request: on Windows
    ## when one of the names is not a known code page, elsewhere when iconv
    ## cannot create a converter for the pair.
    when not defined(windows):
      result = iconvOpen(destEncoding, srcEncoding)
      # POSIX `iconv_open` reports failure as `(iconv_t)-1`, not as NULL, so
      # both values have to be treated as "no converter".
      if result == nil or cast[int](result) == -1:
        raise newException(EncodingError,
          "cannot create encoding converter from " &
          srcEncoding & " to " & destEncoding)
    else:
      result.dest = nameToCodePage(destEncoding)
      result.src = nameToCodePage(srcEncoding)
      # `nameToCodePage` returns -1 for names it does not know
      if int(result.dest) == -1:
        raise newException(EncodingError,
          "cannot find encoding " & destEncoding)
      if int(result.src) == -1:
        raise newException(EncodingError,
          "cannot find encoding " & srcEncoding)
  proc close*(c: EncodingConverter) =
    ## Releases the resources held by the converter `c`.
    when defined(windows):
      discard # nothing to release here
    else:
      iconvClose(c)
  334. when defined(windows):
  proc convertToWideString(codePage: CodePage, s: string): string =
    ## Converts `s` from `codePage` to UTF-16 LE. The result holds the raw
    ## UTF-16 bytes, i.e. two bytes per converted unit.
    ## Raises `OSError` if the conversion still fails after the buffer has
    ## been sized as requested by the API.
    template toWide(buf: cstring, bufUnits: cint): cint =
      # one `multiByteToWideChar` call over all of `s`, writing into `buf`
      multiByteToWideChar(codePage,
        dwFlags = 0'i32,
        lpMultiByteStr = cstring(s),
        cbMultiByte = cint(s.len),
        lpWideCharStr = buf,
        cchWideChar = bufUnits)

    # educated guess of capacity:
    var capacity = s.len + s.len shr 2
    result = newString(capacity * 2)
    # convert to utf-16 LE
    var written = toWide(cstring(result), cint(capacity))
    if written > capacity:
      assert(false) # cannot happen
    elif written != 0:
      setLen(result, written * 2)
    else:
      # try again; ask for capacity:
      capacity = toWide(nil, cint(0))
      # and do the conversion properly:
      result = newString(capacity * 2)
      written = toWide(cstring(result), cint(capacity))
      if written == 0: raiseOSError(osLastError())
      setLen(result, written * 2)
  proc convertFromWideString(codePage: CodePage, s: string): string =
    ## Converts `s`, which holds raw UTF-16 bytes (two per unit), to
    ## `codePage`. Raises `OSError` if the conversion still fails after the
    ## buffer has been sized as requested by the API.
    let units = s.len div 2

    template fromWide(buf: cstring, bufBytes: cint): cint =
      # one `wideCharToMultiByte` call over all of `s`, writing into `buf`
      wideCharToMultiByte(codePage,
        dwFlags = 0'i32,
        lpWideCharStr = cstring(s),
        cchWideChar = cint(units),
        lpMultiByteStr = buf,
        cbMultiByte = bufBytes)

    # first attempt with a guessed capacity:
    var capacity = s.len + s.len shr 2
    result = newString(capacity)
    var written = fromWide(cstring(result), capacity.cint)
    if written > capacity:
      assert(false) # cannot happen
    elif written != 0:
      setLen(result, written)
    else:
      # try again; ask for capacity:
      capacity = fromWide(nil, cint(0))
      # and do the conversion properly:
      result = newString(capacity)
      written = fromWide(cstring(result), capacity.cint)
      if written == 0: raiseOSError(osLastError())
      setLen(result, written)
  proc convertWin(codePageFrom: CodePage, codePageTo: CodePage,
                  s: string): string =
    ## Converts `s` from `codePageFrom` to `codePageTo`, using UTF-16 LE as
    ## the intermediate representation.
    ## Raises `EncodingError` if either code page is 1201, 12000 or 12001.
    # code pages that multiByteToWideChar does not support encoding from
    const unsupported = [1201, 12000, 12001]
    const utf16le = 1200
    # special case: empty string: needed because MultiByteToWideChar,
    # WideCharToMultiByte return 0 in case of error
    if s.len == 0:
      return ""
    let
      fromId = int(codePageFrom)
      toId = int(codePageTo)
    if fromId in unsupported:
      raise newException(EncodingError,
        "encoding from " & codePageToName(codePageFrom) &
        " is not supported on windows")
    if toId in unsupported:
      raise newException(EncodingError,
        "encoding to " & codePageToName(codePageTo) &
        " is not supported on windows")
    # in case it's already UTF-16 little endian - conversion can be simplified
    let wide =
      if fromId == utf16le: s
      else: convertToWideString(codePageFrom, s)
    result =
      if toId == utf16le: wide
      else: convertFromWideString(codePageTo, wide)
  proc convert*(c: EncodingConverter, s: string): string =
    ## Converts `s` from the source to the destination code page stored in
    ## the converter `c`.
    convertWin(c.src, c.dest, s)
  420. else:
  proc convert*(c: EncodingConverter, s: string): string =
    ## Converts `s` to `destEncoding` that was given to the converter `c`. It
    ## assumes that `s` is in `srcEncoding`.
    ##
    ## Bytes that iconv rejects as invalid or incomplete are copied to the
    ## output unchanged. Any other iconv failure raises `OSError`.
    ##
    ## .. warning:: UTF-16BE and UTF-32 conversions are not supported on Windows.
    result = newString(s.len)
    var
      inLen = csize_t len(s)
      outLen = csize_t len(result)
      src = cstring(s)
      dst = cstring(result)

    template growOutput() =
      # Enlarges `result` and re-derives `dst`/`outLen` from the write offset,
      # because `setLen` may hand back a different buffer.
      let offset = cast[int](dst) - cast[int](cstring(result))
      setLen(result, len(result) + inLen.int * 2 + 5)
      # 5 is minimally one utf-8 char
      dst = cast[cstring](cast[int](cstring(result)) + offset)
      outLen = csize_t(len(result) - offset)

    while inLen > 0:
      if iconv(c, addr src, addr inLen, addr dst, addr outLen) == high(csize_t):
        let lerr = errno
        if lerr == EILSEQ or lerr == EINVAL:
          # unknown char, skip: copy the offending byte verbatim. Make room
          # first, otherwise the write would land behind the buffer and the
          # unsigned `outLen` would wrap around.
          if outLen == 0: growOutput()
          dst[0] = src[0]
          src = cast[cstring](cast[int](src) + 1)
          dst = cast[cstring](cast[int](dst) + 1)
          dec(inLen)
          dec(outLen)
        elif lerr == E2BIG:
          growOutput()
        else:
          raiseOSError(lerr.OSErrorCode)
    # iconv has a buffer that needs flushing, specially if the last char is
    # not '\0'. The result of the flush itself decides whether it has to be
    # repeated with more room.
    if iconv(c, nil, nil, addr dst, addr outLen) == high(csize_t) and
        errno == E2BIG:
      growOutput()
      discard iconv(c, nil, nil, addr dst, addr outLen)
    # trim output buffer
    setLen(result, len(result) - outLen.int)
  proc convert*(s: string, destEncoding = "UTF-8",
                srcEncoding = "CP1252"): string =
    ## Converts `s` to `destEncoding`, assuming that `s` is in `srcEncoding`.
    ## A converter is opened, used once and closed again, which is more
    ## convenient but also likely less efficient than re-using a converter.
    ##
    ## .. warning:: UTF-16BE and UTF-32 conversions are not supported on Windows.
    let conv = open(destEncoding, srcEncoding)
    # close the converter no matter how the conversion ends
    defer: close(conv)
    result = convert(conv, s)