encodings.nim 26 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548
  1. #
  2. #
  3. # Nim's Runtime Library
  4. # (c) Copyright 2015 Andreas Rumpf
  5. #
  6. # See the file "copying.txt", included in this
  7. # distribution, for details about the copyright.
  8. #
  9. ## Converts between different character encodings. On UNIX, this uses
  10. ## the `iconv`:idx: library, on Windows the Windows API.
  11. import os
  12. when not defined(windows):
  13. type
  14. ConverterObj = object
  15. EncodingConverter* = ptr ConverterObj ## can convert between two character sets
  16. else:
  17. type
  18. CodePage = distinct int32
  19. EncodingConverter* = object
  20. dest, src: CodePage
  21. type
  22. EncodingError* = object of ValueError ## exception that is raised
  23. ## for encoding errors
  24. when defined(windows):
  25. import parseutils, strutils
  26. proc eqEncodingNames(a, b: string): bool =
  27. var i = 0
  28. var j = 0
  29. while i < a.len and j < b.len:
  30. if a[i] in {'-', '_'}: inc i
  31. if b[j] in {'-', '_'}: inc j
  32. if i < a.len and j < b.len and
  33. a[i].toLowerAscii != b[j].toLowerAscii:
  34. return false
  35. inc i
  36. inc j
  37. result = i == a.len and j == b.len
# Table mapping Windows code page numbers to their encoding names.
# An empty name marks a code page that has no name in this table; such
# entries can only be addressed by number. Lookups by name return the first
# matching entry, so for duplicated names the earlier code page wins.
const
  winEncodings = [
    (1, "OEMCP"), # current OEM codepage
    (037, "IBM037"), # IBM EBCDIC US-Canada
    (437, "IBM437"), # OEM United States
    (500, "IBM500"), # IBM EBCDIC International
    (708, "ASMO-708"), # Arabic (ASMO 708)
    (709, "ASMO_449"), # Arabic (ASMO-449+, BCON V4)
    (710, ""), # Arabic - Transparent Arabic
    (720, "DOS-720"), # Arabic (Transparent ASMO); Arabic (DOS)
    (737, "ibm737"), # OEM Greek (formerly 437G); Greek (DOS)
    (775, "ibm775"), # OEM Baltic; Baltic (DOS)
    (850, "ibm850"), # OEM Multilingual Latin 1; Western European (DOS)
    (852, "ibm852"), # OEM Latin 2; Central European (DOS)
    (855, "IBM855"), # OEM Cyrillic (primarily Russian)
    (857, "ibm857"), # OEM Turkish; Turkish (DOS)
    (858, "IBM00858"), # OEM Multilingual Latin 1 + Euro symbol
    (860, "IBM860"), # OEM Portuguese; Portuguese (DOS)
    (861, "ibm861"), # OEM Icelandic; Icelandic (DOS)
    (862, "DOS-862"), # OEM Hebrew; Hebrew (DOS)
    (863, "IBM863"), # OEM French Canadian; French Canadian (DOS)
    (864, "IBM864"), # OEM Arabic; Arabic (864)
    (865, "IBM865"), # OEM Nordic; Nordic (DOS)
    (866, "cp866"), # OEM Russian; Cyrillic (DOS)
    (869, "ibm869"), # OEM Modern Greek; Greek, Modern (DOS)
    (870, "IBM870"), # IBM EBCDIC Multilingual/ROECE (Latin 2); IBM EBCDIC Multilingual Latin 2
    (874, "windows-874"), # ANSI/OEM Thai (based on TIS-620 / ISO 8859-11); Thai (Windows)
    (875, "cp875"), # IBM EBCDIC Greek Modern
    (932, "shift_jis"), # ANSI/OEM Japanese; Japanese (Shift-JIS)
    (936, "gb2312"), # ANSI/OEM Simplified Chinese (PRC, Singapore); Chinese Simplified (GB2312)
    (949, "ks_c_5601-1987"), # ANSI/OEM Korean (Unified Hangul Code)
    (950, "big5"), # ANSI/OEM Traditional Chinese (Taiwan; Hong Kong SAR, PRC); Chinese Traditional (Big5)
    (1026, "IBM1026"), # IBM EBCDIC Turkish (Latin 5)
    (1047, "IBM01047"), # IBM EBCDIC Latin 1/Open System
    (1140, "IBM01140"), # IBM EBCDIC US-Canada (037 + Euro symbol); IBM EBCDIC (US-Canada-Euro)
    (1141, "IBM01141"), # IBM EBCDIC Germany (20273 + Euro symbol); IBM EBCDIC (Germany-Euro)
    (1142, "IBM01142"), # IBM EBCDIC Denmark-Norway (20277 + Euro symbol); IBM EBCDIC (Denmark-Norway-Euro)
    (1143, "IBM01143"), # IBM EBCDIC Finland-Sweden (20278 + Euro symbol); IBM EBCDIC (Finland-Sweden-Euro)
    (1144, "IBM01144"), # IBM EBCDIC Italy (20280 + Euro symbol); IBM EBCDIC (Italy-Euro)
    (1145, "IBM01145"), # IBM EBCDIC Latin America-Spain (20284 + Euro symbol); IBM EBCDIC (Spain-Euro)
    (1146, "IBM01146"), # IBM EBCDIC United Kingdom (20285 + Euro symbol); IBM EBCDIC (UK-Euro)
    (1147, "IBM01147"), # IBM EBCDIC France (20297 + Euro symbol); IBM EBCDIC (France-Euro)
    (1148, "IBM01148"), # IBM EBCDIC International (500 + Euro symbol); IBM EBCDIC (International-Euro)
    (1149, "IBM01149"), # IBM EBCDIC Icelandic (20871 + Euro symbol); IBM EBCDIC (Icelandic-Euro)
    (1200, "utf-16"), # Unicode UTF-16, little endian byte order (BMP of ISO 10646); available only to managed applications
    (1201, "unicodeFFFE"), # Unicode UTF-16, big endian byte order; available only to managed applications
    (1250, "windows-1250"), # ANSI Central European; Central European (Windows)
    (1251, "windows-1251"), # ANSI Cyrillic; Cyrillic (Windows)
    (1252, "windows-1252"), # ANSI Latin 1; Western European (Windows)
    (1253, "windows-1253"), # ANSI Greek; Greek (Windows)
    (1254, "windows-1254"), # ANSI Turkish; Turkish (Windows)
    (1255, "windows-1255"), # ANSI Hebrew; Hebrew (Windows)
    (1256, "windows-1256"), # ANSI Arabic; Arabic (Windows)
    (1257, "windows-1257"), # ANSI Baltic; Baltic (Windows)
    (1258, "windows-1258"), # ANSI/OEM Vietnamese; Vietnamese (Windows)
    (1250, "cp-1250"), # ANSI Central European; Central European (Windows)
    (1251, "cp-1251"), # ANSI Cyrillic; Cyrillic (Windows)
    (1252, "cp-1252"), # ANSI Latin 1; Western European (Windows)
    (1253, "cp-1253"), # ANSI Greek; Greek (Windows)
    (1254, "cp-1254"), # ANSI Turkish; Turkish (Windows)
    (1255, "cp-1255"), # ANSI Hebrew; Hebrew (Windows)
    (1256, "cp-1256"), # ANSI Arabic; Arabic (Windows)
    (1257, "cp-1257"), # ANSI Baltic; Baltic (Windows)
    (1258, "cp-1258"), # ANSI/OEM Vietnamese; Vietnamese (Windows)
    (1361, "Johab"), # Korean (Johab)
    (10000, "macintosh"), # MAC Roman; Western European (Mac)
    (10001, "x-mac-japanese"), # Japanese (Mac)
    (10002, "x-mac-chinesetrad"), # MAC Traditional Chinese (Big5); Chinese Traditional (Mac)
    (10003, "x-mac-korean"), # Korean (Mac)
    (10004, "x-mac-arabic"), # Arabic (Mac)
    (10005, "x-mac-hebrew"), # Hebrew (Mac)
    (10006, "x-mac-greek"), # Greek (Mac)
    (10007, "x-mac-cyrillic"), # Cyrillic (Mac)
    (10008, "x-mac-chinesesimp"), # MAC Simplified Chinese (GB 2312); Chinese Simplified (Mac)
    (10010, "x-mac-romanian"), # Romanian (Mac)
    (10017, "x-mac-ukrainian"), # Ukrainian (Mac)
    (10021, "x-mac-thai"), # Thai (Mac)
    (10029, "x-mac-ce"), # MAC Latin 2; Central European (Mac)
    (10079, "x-mac-icelandic"), # Icelandic (Mac)
    (10081, "x-mac-turkish"), # Turkish (Mac)
    (10082, "x-mac-croatian"), # Croatian (Mac)
    (12000, "utf-32"), # Unicode UTF-32, little endian byte order; available only to managed applications
    (12001, "utf-32BE"), # Unicode UTF-32, big endian byte order; available only to managed applications
    (20000, "x-Chinese_CNS"), # CNS Taiwan; Chinese Traditional (CNS)
    (20001, "x-cp20001"), # TCA Taiwan
    (20002, "x_Chinese-Eten"), # Eten Taiwan; Chinese Traditional (Eten)
    (20003, "x-cp20003"), # IBM5550 Taiwan
    (20004, "x-cp20004"), # TeleText Taiwan
    (20005, "x-cp20005"), # Wang Taiwan
    (20105, "x-IA5"), # IA5 (IRV International Alphabet No. 5, 7-bit); Western European (IA5)
    (20106, "x-IA5-German"), # IA5 German (7-bit)
    (20107, "x-IA5-Swedish"), # IA5 Swedish (7-bit)
    (20108, "x-IA5-Norwegian"), # IA5 Norwegian (7-bit)
    (20127, "us-ascii"), # US-ASCII (7-bit)
    (20261, "x-cp20261"), # T.61
    (20269, "x-cp20269"), # ISO 6937 Non-Spacing Accent
    (20273, "IBM273"), # IBM EBCDIC Germany
    (20277, "IBM277"), # IBM EBCDIC Denmark-Norway
    (20278, "IBM278"), # IBM EBCDIC Finland-Sweden
    (20280, "IBM280"), # IBM EBCDIC Italy
    (20284, "IBM284"), # IBM EBCDIC Latin America-Spain
    (20285, "IBM285"), # IBM EBCDIC United Kingdom
    (20290, "IBM290"), # IBM EBCDIC Japanese Katakana Extended
    (20297, "IBM297"), # IBM EBCDIC France
    (20420, "IBM420"), # IBM EBCDIC Arabic
    (20423, "IBM423"), # IBM EBCDIC Greek
    (20424, "IBM424"), # IBM EBCDIC Hebrew
    (20833, "x-EBCDIC-KoreanExtended"), # IBM EBCDIC Korean Extended
    (20838, "IBM-Thai"), # IBM EBCDIC Thai
    (20866, "koi8-r"), # Russian (KOI8-R); Cyrillic (KOI8-R)
    (20871, "IBM871"), # IBM EBCDIC Icelandic
    (20880, "IBM880"), # IBM EBCDIC Cyrillic Russian
    (20905, "IBM905"), # IBM EBCDIC Turkish
    (20924, "IBM00924"), # IBM EBCDIC Latin 1/Open System (1047 + Euro symbol)
    (20932, "EUC-JP"), # Japanese (JIS 0208-1990 and 0121-1990)
    (20936, "x-cp20936"), # Simplified Chinese (GB2312); Chinese Simplified (GB2312-80)
    (20949, "x-cp20949"), # Korean Wansung
    (21025, "cp1025"), # IBM EBCDIC Cyrillic Serbian-Bulgarian
    (21027, ""), # (deprecated)
    (21866, "koi8-u"), # Ukrainian (KOI8-U); Cyrillic (KOI8-U)
    (28591, "iso-8859-1"), # ISO 8859-1 Latin 1; Western European (ISO)
    (28592, "iso-8859-2"), # ISO 8859-2 Central European; Central European (ISO)
    (28593, "iso-8859-3"), # ISO 8859-3 Latin 3
    (28594, "iso-8859-4"), # ISO 8859-4 Baltic
    (28595, "iso-8859-5"), # ISO 8859-5 Cyrillic
    (28596, "iso-8859-6"), # ISO 8859-6 Arabic
    (28597, "iso-8859-7"), # ISO 8859-7 Greek
    (28598, "iso-8859-8"), # ISO 8859-8 Hebrew; Hebrew (ISO-Visual)
    (28599, "iso-8859-9"), # ISO 8859-9 Turkish
    (28603, "iso-8859-13"), # ISO 8859-13 Estonian
    (28605, "iso-8859-15"), # ISO 8859-15 Latin 9
    (29001, "x-Europa"), # Europa 3
    (38598, "iso-8859-8-i"), # ISO 8859-8 Hebrew; Hebrew (ISO-Logical)
    (50220, "iso-2022-jp"), # ISO 2022 Japanese with no halfwidth Katakana; Japanese (JIS)
    (50221, "csISO2022JP"), # ISO 2022 Japanese with halfwidth Katakana; Japanese (JIS-Allow 1 byte Kana)
    (50222, "iso-2022-jp"), # ISO 2022 Japanese JIS X 0201-1989; Japanese (JIS-Allow 1 byte Kana - SO/SI)
    (50225, "iso-2022-kr"), # ISO 2022 Korean
    (50227, "x-cp50227"), # ISO 2022 Simplified Chinese; Chinese Simplified (ISO 2022)
    (50229, ""), # ISO 2022 Traditional Chinese
    (50930, ""), # EBCDIC Japanese (Katakana) Extended
    (50931, ""), # EBCDIC US-Canada and Japanese
    (50933, ""), # EBCDIC Korean Extended and Korean
    (50935, ""), # EBCDIC Simplified Chinese Extended and Simplified Chinese
    (50936, ""), # EBCDIC Simplified Chinese
    (50937, ""), # EBCDIC US-Canada and Traditional Chinese
    (50939, ""), # EBCDIC Japanese (Latin) Extended and Japanese
    (51932, "euc-jp"), # EUC Japanese
    (51936, "EUC-CN"), # EUC Simplified Chinese; Chinese Simplified (EUC)
    (51949, "euc-kr"), # EUC Korean
    (51950, ""), # EUC Traditional Chinese
    (52936, "hz-gb-2312"), # HZ-GB2312 Simplified Chinese; Chinese Simplified (HZ)
    (54936, "GB18030"), # Windows XP and later: GB18030 Simplified Chinese (4 byte); Chinese Simplified (GB18030)
    (57002, "x-iscii-de"), # ISCII Devanagari
    (57003, "x-iscii-be"), # ISCII Bengali
    (57004, "x-iscii-ta"), # ISCII Tamil
    (57005, "x-iscii-te"), # ISCII Telugu
    (57006, "x-iscii-as"), # ISCII Assamese
    (57007, "x-iscii-or"), # ISCII Oriya
    (57008, "x-iscii-ka"), # ISCII Kannada
    (57009, "x-iscii-ma"), # ISCII Malayalam
    (57010, "x-iscii-gu"), # ISCII Gujarati
    (57011, "x-iscii-pa"), # ISCII Punjabi
    (65000, "utf-7"), # Unicode (UTF-7)
    (65001, "utf-8")] # Unicode (UTF-8)
# Disabled (`when false`): binding for kernel32's `GetCPInfo` together with
# the record it fills in. Kept for reference; nothing in this section is
# compiled.
when false:
  # not needed yet:
  type
    CpInfo = object
      maxCharSize: int32
      defaultChar: array[0..1, char]
      leadByte: array[0..12-1, char]

  proc getCPInfo(codePage: CodePage, lpCPInfo: var CpInfo): int32 {.
    stdcall, importc: "GetCPInfo", dynlib: "kernel32".}
  211. proc nameToCodePage*(name: string): CodePage =
  212. var nameAsInt: int
  213. if parseInt(name, nameAsInt) == 0: nameAsInt = -1
  214. for no, na in items(winEncodings):
  215. if no == nameAsInt or eqEncodingNames(na, name): return CodePage(no)
  216. result = CodePage(-1)
  217. proc codePageToName*(c: CodePage): string =
  218. for no, na in items(winEncodings):
  219. if no == int(c):
  220. return if na.len != 0: na else: $no
  221. result = ""
# Bindings to the kernel32 functions used by the Windows implementation.

# `GetACP`: its result is used by `getCurrentEncoding` when `uiApp` is true.
proc getACP(): CodePage {.stdcall, importc: "GetACP", dynlib: "kernel32".}

# `GetConsoleCP`: its result is used by `getCurrentEncoding` when `uiApp` is
# false.
proc getGetConsoleCP(): CodePage {.stdcall, importc: "GetConsoleCP",
  dynlib: "kernel32".}

# `MultiByteToWideChar`. The wide-character buffer is declared as `cstring`
# here; callers in this file size it as two bytes per `cchWideChar` unit.
# A return value of 0 is treated as failure by the callers in this file.
proc multiByteToWideChar(
  codePage: CodePage,
  dwFlags: int32,
  lpMultiByteStr: cstring,
  cbMultiByte: cint,
  lpWideCharStr: cstring,
  cchWideChar: cint): cint {.
    stdcall, importc: "MultiByteToWideChar", dynlib: "kernel32".}

# `WideCharToMultiByte`. The wide-character input is declared as `cstring`
# here; `cchWideChar` counts wide characters, `cbMultiByte` counts bytes.
# A return value of 0 is treated as failure by the callers in this file.
proc wideCharToMultiByte(
  codePage: CodePage,
  dwFlags: int32,
  lpWideCharStr: cstring,
  cchWideChar: cint,
  lpMultiByteStr: cstring,
  cbMultiByte: cint,
  lpDefaultChar: cstring = nil,
  lpUsedDefaultChar: pointer = nil): cint {.
    stdcall, importc: "WideCharToMultiByte", dynlib: "kernel32".}
  243. else:
  244. when defined(haiku):
  245. const iconvDll = "libiconv.so"
  246. elif defined(macosx):
  247. const iconvDll = "libiconv.dylib"
  248. else:
  249. const iconvDll = "(libc.so.6|libiconv.so)"
  250. const
  251. E2BIG = 7.cint
  252. EINVAL = 22.cint
  253. when defined(linux):
  254. const EILSEQ = 84.cint
  255. elif defined(macosx):
  256. const EILSEQ = 92.cint
  257. elif defined(bsd):
  258. const EILSEQ = 86.cint
  259. elif defined(solaris):
  260. const EILSEQ = 88.cint
  261. elif defined(haiku):
  262. const EILSEQ = -2147454938.cint
  263. var errno {.importc, header: "<errno.h>".}: cint
  264. when defined(freebsd) or defined(netbsd):
  265. {.pragma: importIconv, cdecl, header: "<iconv.h>".}
  266. else:
  267. {.pragma: importIconv, cdecl, dynlib: iconvDll.}
# Bindings to the C iconv API; `importIconv` selects header or dynlib
# binding depending on the platform.

# `iconv_open`: `open` in this file treats a nil result as failure.
proc iconvOpen(tocode, fromcode: cstring): EncodingConverter {.
  importc: "iconv_open", importIconv.}

# `iconv_close`: called by `close` in this file.
proc iconvClose(c: EncodingConverter) {.
  importc: "iconv_close", importIconv.}

# `iconv`: the buffer pointers and remaining-byte counters are passed by
# address. `convert` in this file treats `high(csize_t)` as the error return
# and then inspects `errno`.
proc iconv(c: EncodingConverter, inbuf: ptr cstring, inbytesLeft: ptr csize_t,
           outbuf: ptr cstring, outbytesLeft: ptr csize_t): csize_t {.
  importc: "iconv", importIconv.}
  275. proc getCurrentEncoding*(uiApp = false): string =
  276. ## retrieves the current encoding. On Unix, always "UTF-8" is returned.
  277. ## The `uiApp` parameter is Windows specific. If true, the UI's code-page
  278. ## is returned, if false, the Console's code-page is returned.
  279. when defined(windows):
  280. result = codePageToName(if uiApp: getACP() else: getGetConsoleCP())
  281. else:
  282. result = "UTF-8"
  283. proc open*(destEncoding = "UTF-8", srcEncoding = "CP1252"): EncodingConverter =
  284. ## opens a converter that can convert from `srcEncoding` to `destEncoding`.
  285. ## Raises `IOError` if it cannot fulfill the request.
  286. when not defined(windows):
  287. result = iconvOpen(destEncoding, srcEncoding)
  288. if result == nil:
  289. raise newException(EncodingError,
  290. "cannot create encoding converter from " &
  291. srcEncoding & " to " & destEncoding)
  292. else:
  293. result.dest = nameToCodePage(destEncoding)
  294. result.src = nameToCodePage(srcEncoding)
  295. if int(result.dest) == -1:
  296. raise newException(EncodingError,
  297. "cannot find encoding " & destEncoding)
  298. if int(result.src) == -1:
  299. raise newException(EncodingError,
  300. "cannot find encoding " & srcEncoding)
  301. proc close*(c: EncodingConverter) =
  302. ## frees the resources the converter `c` holds.
  303. when not defined(windows):
  304. iconvClose(c)
  305. when defined(windows):
  306. proc convertToWideString(codePage: CodePage, s: string): string =
  307. # educated guess of capacity:
  308. var cap = s.len + s.len shr 2
  309. result = newString(cap*2)
  310. # convert to utf-16 LE
  311. var m = multiByteToWideChar(codePage,
  312. dwFlags = 0'i32,
  313. lpMultiByteStr = cstring(s),
  314. cbMultiByte = cint(s.len),
  315. lpWideCharStr = cstring(result),
  316. cchWideChar = cint(cap))
  317. if m == 0:
  318. # try again; ask for capacity:
  319. cap = multiByteToWideChar(codePage,
  320. dwFlags = 0'i32,
  321. lpMultiByteStr = cstring(s),
  322. cbMultiByte = cint(s.len),
  323. lpWideCharStr = nil,
  324. cchWideChar = cint(0))
  325. # and do the conversion properly:
  326. result = newString(cap*2)
  327. m = multiByteToWideChar(codePage,
  328. dwFlags = 0'i32,
  329. lpMultiByteStr = cstring(s),
  330. cbMultiByte = cint(s.len),
  331. lpWideCharStr = cstring(result),
  332. cchWideChar = cint(cap))
  333. if m == 0: raiseOSError(osLastError())
  334. setLen(result, m*2)
  335. elif m <= cap:
  336. setLen(result, m*2)
  337. else:
  338. assert(false) # cannot happen
  339. proc convertFromWideString(codePage: CodePage, s: string): string =
  340. let charCount = s.len div 2
  341. var cap = s.len + s.len shr 2
  342. result = newString(cap)
  343. var m = wideCharToMultiByte(codePage,
  344. dwFlags = 0'i32,
  345. lpWideCharStr = cstring(s),
  346. cchWideChar = cint(charCount),
  347. lpMultiByteStr = cstring(result),
  348. cbMultiByte = cap.cint)
  349. if m == 0:
  350. # try again; ask for capacity:
  351. cap = wideCharToMultiByte(codePage,
  352. dwFlags = 0'i32,
  353. lpWideCharStr = cstring(s),
  354. cchWideChar = cint(charCount),
  355. lpMultiByteStr = nil,
  356. cbMultiByte = cint(0))
  357. # and do the conversion properly:
  358. result = newString(cap)
  359. m = wideCharToMultiByte(codePage,
  360. dwFlags = 0'i32,
  361. lpWideCharStr = cstring(s),
  362. cchWideChar = cint(charCount),
  363. lpMultiByteStr = cstring(result),
  364. cbMultiByte = cap.cint)
  365. if m == 0: raiseOSError(osLastError())
  366. setLen(result, m)
  367. elif m <= cap:
  368. setLen(result, m)
  369. else:
  370. assert(false) # cannot happen
  371. proc convertWin(codePageFrom: CodePage, codePageTo: CodePage,
  372. s: string): string =
  373. # special case: empty string: needed because MultiByteToWideChar, WideCharToMultiByte
  374. # return 0 in case of error
  375. if s.len == 0: return ""
  376. # multiByteToWideChar does not support encoding from code pages below
  377. let unsupported = [1201, 12000, 12001]
  378. if int(codePageFrom) in unsupported:
  379. let message = "encoding from " & codePageToName(codePageFrom) & " is not supported on windows"
  380. raise newException(EncodingError, message)
  381. if int(codePageTo) in unsupported:
  382. let message = "encoding to " & codePageToName(codePageTo) & " is not supported on windows"
  383. raise newException(EncodingError, message)
  384. # in case it's already UTF-16 little endian - conversion can be simplified
  385. let wideString = if int(codePageFrom) == 1200: s
  386. else: convertToWideString(codePageFrom, s)
  387. return if int(codePageTo) == 1200: wideString
  388. else: convertFromWideString(codePageTo, wideString)
  389. proc convert*(c: EncodingConverter, s: string): string =
  390. ## converts `s` to `destEncoding` that was given to the converter `c`. It
  391. ## assumed that `s` is in `srcEncoding`.
  392. ## utf-16BE, utf-32 conversions not supported on windows
  393. result = convertWin(c.src, c.dest, s)
  394. else:
  395. proc convert*(c: EncodingConverter, s: string): string =
  396. result = newString(s.len)
  397. var inLen = csize_t len(s)
  398. var outLen = csize_t len(result)
  399. var src = cstring(s)
  400. var dst = cstring(result)
  401. var iconvres: csize_t
  402. while inLen > 0:
  403. iconvres = iconv(c, addr src, addr inLen, addr dst, addr outLen)
  404. if iconvres == high(csize_t):
  405. var lerr = errno
  406. if lerr == EILSEQ or lerr == EINVAL:
  407. # unknown char, skip
  408. dst[0] = src[0]
  409. src = cast[cstring](cast[int](src) + 1)
  410. dst = cast[cstring](cast[int](dst) + 1)
  411. dec(inLen)
  412. dec(outLen)
  413. elif lerr == E2BIG:
  414. var offset = cast[int](dst) - cast[int](cstring(result))
  415. setLen(result, len(result) + inLen.int * 2 + 5)
  416. # 5 is minimally one utf-8 char
  417. dst = cast[cstring](cast[int](cstring(result)) + offset)
  418. outLen = csize_t(len(result) - offset)
  419. else:
  420. raiseOSError(lerr.OSErrorCode)
  421. # iconv has a buffer that needs flushing, specially if the last char is
  422. # not '\0'
  423. discard iconv(c, nil, nil, addr dst, addr outLen)
  424. if iconvres == high(csize_t) and errno == E2BIG:
  425. var offset = cast[int](dst) - cast[int](cstring(result))
  426. setLen(result, len(result) + inLen.int * 2 + 5)
  427. # 5 is minimally one utf-8 char
  428. dst = cast[cstring](cast[int](cstring(result)) + offset)
  429. outLen = csize_t(len(result) - offset)
  430. discard iconv(c, nil, nil, addr dst, addr outLen)
  431. # trim output buffer
  432. setLen(result, len(result) - outLen.int)
  433. proc convert*(s: string, destEncoding = "UTF-8",
  434. srcEncoding = "CP1252"): string =
  435. ## converts `s` to `destEncoding`. It assumed that `s` is in `srcEncoding`.
  436. ## This opens a converter, uses it and closes it again and is thus more
  437. ## convenient but also likely less efficient than re-using a converter.
  438. ## utf-16BE, utf-32 conversions not supported on windows
  439. var c = open(destEncoding, srcEncoding)
  440. try:
  441. result = convert(c, s)
  442. finally:
  443. close(c)
  444. when not defined(testing) and isMainModule:
  445. let
  446. orig = "öäüß"
  447. cp1252 = convert(orig, "CP1252", "UTF-8")
  448. ibm850 = convert(cp1252, "ibm850", "CP1252")
  449. current = getCurrentEncoding()
  450. echo "Original string from source code: ", orig
  451. echo "Forced ibm850 encoding: ", ibm850
  452. echo "Current encoding: ", current
  453. echo "From ibm850 to current: ", convert(ibm850, current, "ibm850")
# Windows-only self-test, run when this module is the main module and
# `testing` is not defined. Each block checks one conversion scenario of the
# string-based `convert` overload.
when not defined(testing) and isMainModule and defined(windows):
  # Conversions from or to utf-16BE / utf-32 must be rejected.
  block should_throw_on_unsupported_conversions:
    let original = "some string"

    doAssertRaises(EncodingError):
      discard convert(original, "utf-8", "utf-32")

    doAssertRaises(EncodingError):
      discard convert(original, "utf-8", "unicodeFFFE")

    doAssertRaises(EncodingError):
      discard convert(original, "utf-8", "utf-32BE")

    doAssertRaises(EncodingError):
      discard convert(original, "unicodeFFFE", "utf-8")

    doAssertRaises(EncodingError):
      discard convert(original, "utf-32", "utf-8")

    doAssertRaises(EncodingError):
      discard convert(original, "utf-32BE", "utf-8")

  block should_convert_from_utf16_to_utf8:
    let original = "\x42\x04\x35\x04\x41\x04\x42\x04" # utf-16 little endian test string "тест"
    let result = convert(original, "utf-8", "utf-16")
    doAssert(result == "\xd1\x82\xd0\xb5\xd1\x81\xd1\x82")

  block should_convert_from_utf16_to_win1251:
    let original = "\x42\x04\x35\x04\x41\x04\x42\x04" # utf-16 little endian test string "тест"
    let result = convert(original, "windows-1251", "utf-16")
    doAssert(result == "\xf2\xe5\xf1\xf2")

  block should_convert_from_win1251_to_koi8r:
    let original = "\xf2\xe5\xf1\xf2" # win1251 test string "тест"
    let result = convert(original, "koi8-r", "windows-1251")
    doAssert(result == "\xd4\xc5\xd3\xd4")

  block should_convert_from_koi8r_to_win1251:
    let original = "\xd4\xc5\xd3\xd4" # koi8r test string "тест"
    let result = convert(original, "windows-1251", "koi8-r")
    doAssert(result == "\xf2\xe5\xf1\xf2")

  block should_convert_from_utf8_to_win1251:
    let original = "\xd1\x82\xd0\xb5\xd1\x81\xd1\x82" # utf-8 test string "тест"
    let result = convert(original, "windows-1251", "utf-8")
    doAssert(result == "\xf2\xe5\xf1\xf2")

  block should_convert_from_utf8_to_utf16:
    let original = "\xd1\x82\xd0\xb5\xd1\x81\xd1\x82" # utf-8 test string "тест"
    let result = convert(original, "utf-16", "utf-8")
    doAssert(result == "\x42\x04\x35\x04\x41\x04\x42\x04")

  # The empty string must convert to the empty string for every pair.
  block should_handle_empty_string_for_any_conversion:
    let original = ""
    var result = convert(original, "utf-16", "utf-8")
    doAssert(result == "")
    result = convert(original, "utf-8", "utf-16")
    doAssert(result == "")
    result = convert(original, "windows-1251", "koi8-r")
    doAssert(result == "")