encodings.nim 20 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465
  1. #
  2. #
  3. # Nim's Runtime Library
  4. # (c) Copyright 2015 Andreas Rumpf
  5. #
  6. # See the file "copying.txt", included in this
  7. # distribution, for details about the copyright.
  8. #
  9. ## Converts between different character encodings. On UNIX, this uses
  10. ## the `iconv`:idx: library, on Windows the Windows API.
  11. import os, parseutils, strutils
  when defined(windows):
    type
      # numeric identifier of a Windows code page
      CodePage = distinct int32
      EncodingConverter* = object ## can convert between two character sets
        dest, src: CodePage
  else:
    type
      # empty placeholder type; only pointers to it are ever used
      ConverterObj = object
      EncodingConverter* = ptr ConverterObj ## can convert between two character sets
  type
    EncodingError* = object of ValueError ## exception that is raised
                                          ## for encoding errors
  24. when defined(windows):
  proc eqEncodingNames(a, b: string): bool =
    ## Compares two encoding names, ignoring ASCII case and any ``'-'`` or
    ## ``'_'`` separators, so that e.g. ``"UTF-8"``, ``"utf_8"`` and ``"utf8"``
    ## are all considered equal.
    ##
    ## Returns `true` only if both names are fully consumed without a
    ## mismatch.
    const separators = {'-', '_'}
    var i = 0
    var j = 0
    while true:
      # skip *all* separators (also trailing ones) before looking at a char,
      # so that the indices can never run past the end of either string
      while i < a.len and a[i] in separators: inc i
      while j < b.len and b[j] in separators: inc j
      if i >= a.len or j >= b.len: break
      if a[i].toLowerAscii != b[j].toLowerAscii: return false
      inc i
      inc j
    result = i == a.len and j == b.len
  const
    # Table of (Windows code page number, encoding name) pairs.
    # An empty name means the code page can only be referred to by number.
    # Some numbers and some names occur more than once; lookups that scan
    # the table front to back pick the first matching entry.
    winEncodings = [
      (1, "OEMCP"), # current OEM codepage
      (037, "IBM037"), # IBM EBCDIC US-Canada
      (437, "IBM437"), # OEM United States
      (500, "IBM500"), # IBM EBCDIC International
      (708, "ASMO-708"), # Arabic (ASMO 708)
      (709, "ASMO_449"), # Arabic (ASMO-449+, BCON V4)
      (710, ""), # Arabic - Transparent Arabic
      (720, "DOS-720"), # Arabic (Transparent ASMO); Arabic (DOS)
      (737, "ibm737"), # OEM Greek (formerly 437G); Greek (DOS)
      (775, "ibm775"), # OEM Baltic; Baltic (DOS)
      (850, "ibm850"), # OEM Multilingual Latin 1; Western European (DOS)
      (852, "ibm852"), # OEM Latin 2; Central European (DOS)
      (855, "IBM855"), # OEM Cyrillic (primarily Russian)
      (857, "ibm857"), # OEM Turkish; Turkish (DOS)
      (858, "IBM00858"), # OEM Multilingual Latin 1 + Euro symbol
      (860, "IBM860"), # OEM Portuguese; Portuguese (DOS)
      (861, "ibm861"), # OEM Icelandic; Icelandic (DOS)
      (862, "DOS-862"), # OEM Hebrew; Hebrew (DOS)
      (863, "IBM863"), # OEM French Canadian; French Canadian (DOS)
      (864, "IBM864"), # OEM Arabic; Arabic (864)
      (865, "IBM865"), # OEM Nordic; Nordic (DOS)
      (866, "cp866"), # OEM Russian; Cyrillic (DOS)
      (869, "ibm869"), # OEM Modern Greek; Greek, Modern (DOS)
      (870, "IBM870"), # IBM EBCDIC Multilingual/ROECE (Latin 2); IBM EBCDIC Multilingual Latin 2
      (874, "windows-874"), # ANSI/OEM Thai (same as 28605, ISO 8859-15); Thai (Windows)
      (875, "cp875"), # IBM EBCDIC Greek Modern
      (932, "shift_jis"), # ANSI/OEM Japanese; Japanese (Shift-JIS)
      (936, "gb2312"), # ANSI/OEM Simplified Chinese (PRC, Singapore); Chinese Simplified (GB2312)
      (949, "ks_c_5601-1987"), # ANSI/OEM Korean (Unified Hangul Code)
      (950, "big5"), # ANSI/OEM Traditional Chinese (Taiwan; Hong Kong SAR, PRC); Chinese Traditional (Big5)
      (1026, "IBM1026"), # IBM EBCDIC Turkish (Latin 5)
      (1047, "IBM01047"), # IBM EBCDIC Latin 1/Open System
      (1140, "IBM01140"), # IBM EBCDIC US-Canada (037 + Euro symbol); IBM EBCDIC (US-Canada-Euro)
      (1141, "IBM01141"), # IBM EBCDIC Germany (20273 + Euro symbol); IBM EBCDIC (Germany-Euro)
      (1142, "IBM01142"), # IBM EBCDIC Denmark-Norway (20277 + Euro symbol); IBM EBCDIC (Denmark-Norway-Euro)
      (1143, "IBM01143"), # IBM EBCDIC Finland-Sweden (20278 + Euro symbol); IBM EBCDIC (Finland-Sweden-Euro)
      (1144, "IBM01144"), # IBM EBCDIC Italy (20280 + Euro symbol); IBM EBCDIC (Italy-Euro)
      (1145, "IBM01145"), # IBM EBCDIC Latin America-Spain (20284 + Euro symbol); IBM EBCDIC (Spain-Euro)
      (1146, "IBM01146"), # IBM EBCDIC United Kingdom (20285 + Euro symbol); IBM EBCDIC (UK-Euro)
      (1147, "IBM01147"), # IBM EBCDIC France (20297 + Euro symbol); IBM EBCDIC (France-Euro)
      (1148, "IBM01148"), # IBM EBCDIC International (500 + Euro symbol); IBM EBCDIC (International-Euro)
      (1149, "IBM01149"), # IBM EBCDIC Icelandic (20871 + Euro symbol); IBM EBCDIC (Icelandic-Euro)
      (1200, "utf-16"), # Unicode UTF-16, little endian byte order (BMP of ISO 10646); available only to managed applications
      (1201, "unicodeFFFE"), # Unicode UTF-16, big endian byte order; available only to managed applications
      (1250, "windows-1250"), # ANSI Central European; Central European (Windows)
      (1251, "windows-1251"), # ANSI Cyrillic; Cyrillic (Windows)
      (1252, "windows-1252"), # ANSI Latin 1; Western European (Windows)
      (1253, "windows-1253"), # ANSI Greek; Greek (Windows)
      (1254, "windows-1254"), # ANSI Turkish; Turkish (Windows)
      (1255, "windows-1255"), # ANSI Hebrew; Hebrew (Windows)
      (1256, "windows-1256"), # ANSI Arabic; Arabic (Windows)
      (1257, "windows-1257"), # ANSI Baltic; Baltic (Windows)
      (1258, "windows-1258"), # ANSI/OEM Vietnamese; Vietnamese (Windows)
      (1250, "cp-1250"), # ANSI Central European; Central European (Windows)
      (1251, "cp-1251"), # ANSI Cyrillic; Cyrillic (Windows)
      (1252, "cp-1252"), # ANSI Latin 1; Western European (Windows)
      (1253, "cp-1253"), # ANSI Greek; Greek (Windows)
      (1254, "cp-1254"), # ANSI Turkish; Turkish (Windows)
      (1255, "cp-1255"), # ANSI Hebrew; Hebrew (Windows)
      (1256, "cp-1256"), # ANSI Arabic; Arabic (Windows)
      (1257, "cp-1257"), # ANSI Baltic; Baltic (Windows)
      (1258, "cp-1258"), # ANSI/OEM Vietnamese; Vietnamese (Windows)
      (1361, "Johab"), # Korean (Johab)
      (10000, "macintosh"), # MAC Roman; Western European (Mac)
      (10001, "x-mac-japanese"), # Japanese (Mac)
      (10002, "x-mac-chinesetrad"), # MAC Traditional Chinese (Big5); Chinese Traditional (Mac)
      (10003, "x-mac-korean"), # Korean (Mac)
      (10004, "x-mac-arabic"), # Arabic (Mac)
      (10005, "x-mac-hebrew"), # Hebrew (Mac)
      (10006, "x-mac-greek"), # Greek (Mac)
      (10007, "x-mac-cyrillic"), # Cyrillic (Mac)
      (10008, "x-mac-chinesesimp"), # MAC Simplified Chinese (GB 2312); Chinese Simplified (Mac)
      (10010, "x-mac-romanian"), # Romanian (Mac)
      (10017, "x-mac-ukrainian"), # Ukrainian (Mac)
      (10021, "x-mac-thai"), # Thai (Mac)
      (10029, "x-mac-ce"), # MAC Latin 2; Central European (Mac)
      (10079, "x-mac-icelandic"), # Icelandic (Mac)
      (10081, "x-mac-turkish"), # Turkish (Mac)
      (10082, "x-mac-croatian"), # Croatian (Mac)
      (12000, "utf-32"), # Unicode UTF-32, little endian byte order; available only to managed applications
      (12001, "utf-32BE"), # Unicode UTF-32, big endian byte order; available only to managed applications
      (20000, "x-Chinese_CNS"), # CNS Taiwan; Chinese Traditional (CNS)
      (20001, "x-cp20001"), # TCA Taiwan
      (20002, "x_Chinese-Eten"), # Eten Taiwan; Chinese Traditional (Eten)
      (20003, "x-cp20003"), # IBM5550 Taiwan
      (20004, "x-cp20004"), # TeleText Taiwan
      (20005, "x-cp20005"), # Wang Taiwan
      (20105, "x-IA5"), # IA5 (IRV International Alphabet No. 5, 7-bit); Western European (IA5)
      (20106, "x-IA5-German"), # IA5 German (7-bit)
      (20107, "x-IA5-Swedish"), # IA5 Swedish (7-bit)
      (20108, "x-IA5-Norwegian"), # IA5 Norwegian (7-bit)
      (20127, "us-ascii"), # US-ASCII (7-bit)
      (20261, "x-cp20261"), # T.61
      (20269, "x-cp20269"), # ISO 6937 Non-Spacing Accent
      (20273, "IBM273"), # IBM EBCDIC Germany
      (20277, "IBM277"), # IBM EBCDIC Denmark-Norway
      (20278, "IBM278"), # IBM EBCDIC Finland-Sweden
      (20280, "IBM280"), # IBM EBCDIC Italy
      (20284, "IBM284"), # IBM EBCDIC Latin America-Spain
      (20285, "IBM285"), # IBM EBCDIC United Kingdom
      (20290, "IBM290"), # IBM EBCDIC Japanese Katakana Extended
      (20297, "IBM297"), # IBM EBCDIC France
      (20420, "IBM420"), # IBM EBCDIC Arabic
      (20423, "IBM423"), # IBM EBCDIC Greek
      (20424, "IBM424"), # IBM EBCDIC Hebrew
      (20833, "x-EBCDIC-KoreanExtended"), # IBM EBCDIC Korean Extended
      (20838, "IBM-Thai"), # IBM EBCDIC Thai
      (20866, "koi8-r"), # Russian (KOI8-R); Cyrillic (KOI8-R)
      (20871, "IBM871"), # IBM EBCDIC Icelandic
      (20880, "IBM880"), # IBM EBCDIC Cyrillic Russian
      (20905, "IBM905"), # IBM EBCDIC Turkish
      (20924, "IBM00924"), # IBM EBCDIC Latin 1/Open System (1047 + Euro symbol)
      (20932, "EUC-JP"), # Japanese (JIS 0208-1990 and 0121-1990)
      (20936, "x-cp20936"), # Simplified Chinese (GB2312); Chinese Simplified (GB2312-80)
      (20949, "x-cp20949"), # Korean Wansung
      (21025, "cp1025"), # IBM EBCDIC Cyrillic Serbian-Bulgarian
      (21027, ""), # (deprecated)
      (21866, "koi8-u"), # Ukrainian (KOI8-U); Cyrillic (KOI8-U)
      (28591, "iso-8859-1"), # ISO 8859-1 Latin 1; Western European (ISO)
      (28592, "iso-8859-2"), # ISO 8859-2 Central European; Central European (ISO)
      (28593, "iso-8859-3"), # ISO 8859-3 Latin 3
      (28594, "iso-8859-4"), # ISO 8859-4 Baltic
      (28595, "iso-8859-5"), # ISO 8859-5 Cyrillic
      (28596, "iso-8859-6"), # ISO 8859-6 Arabic
      (28597, "iso-8859-7"), # ISO 8859-7 Greek
      (28598, "iso-8859-8"), # ISO 8859-8 Hebrew; Hebrew (ISO-Visual)
      (28599, "iso-8859-9"), # ISO 8859-9 Turkish
      (28603, "iso-8859-13"), # ISO 8859-13 Estonian
      (28605, "iso-8859-15"), # ISO 8859-15 Latin 9
      (29001, "x-Europa"), # Europa 3
      (38598, "iso-8859-8-i"), # ISO 8859-8 Hebrew; Hebrew (ISO-Logical)
      (50220, "iso-2022-jp"), # ISO 2022 Japanese with no halfwidth Katakana; Japanese (JIS)
      (50221, "csISO2022JP"), # ISO 2022 Japanese with halfwidth Katakana; Japanese (JIS-Allow 1 byte Kana)
      (50222, "iso-2022-jp"), # ISO 2022 Japanese JIS X 0201-1989; Japanese (JIS-Allow 1 byte Kana - SO/SI)
      (50225, "iso-2022-kr"), # ISO 2022 Korean
      (50227, "x-cp50227"), # ISO 2022 Simplified Chinese; Chinese Simplified (ISO 2022)
      (50229, ""), # ISO 2022 Traditional Chinese
      (50930, ""), # EBCDIC Japanese (Katakana) Extended
      (50931, ""), # EBCDIC US-Canada and Japanese
      (50933, ""), # EBCDIC Korean Extended and Korean
      (50935, ""), # EBCDIC Simplified Chinese Extended and Simplified Chinese
      (50936, ""), # EBCDIC Simplified Chinese
      (50937, ""), # EBCDIC US-Canada and Traditional Chinese
      (50939, ""), # EBCDIC Japanese (Latin) Extended and Japanese
      (51932, "euc-jp"), # EUC Japanese
      (51936, "EUC-CN"), # EUC Simplified Chinese; Chinese Simplified (EUC)
      (51949, "euc-kr"), # EUC Korean
      (51950, ""), # EUC Traditional Chinese
      (52936, "hz-gb-2312"), # HZ-GB2312 Simplified Chinese; Chinese Simplified (HZ)
      (54936, "GB18030"), # Windows XP and later: GB18030 Simplified Chinese (4 byte); Chinese Simplified (GB18030)
      (57002, "x-iscii-de"), # ISCII Devanagari
      (57003, "x-iscii-be"), # ISCII Bengali
      (57004, "x-iscii-ta"), # ISCII Tamil
      (57005, "x-iscii-te"), # ISCII Telugu
      (57006, "x-iscii-as"), # ISCII Assamese
      (57007, "x-iscii-or"), # ISCII Oriya
      (57008, "x-iscii-ka"), # ISCII Kannada
      (57009, "x-iscii-ma"), # ISCII Malayalam
      (57010, "x-iscii-gu"), # ISCII Gujarati
      (57011, "x-iscii-pa"), # ISCII Punjabi
      (65000, "utf-7"), # Unicode (UTF-7)
      (65001, "utf-8")] # Unicode (UTF-8)
  when false:
    # Disabled on purpose -- nothing needs these declarations yet.
    type
      CpInfo = object
        maxCharSize: int32
        defaultChar: array[2, char]
        leadByte: array[12, char]
    {.deprecated: [TCpInfo: CpInfo].}

    proc getCPInfo(codePage: CodePage, lpCPInfo: var CpInfo): int32 {.
      stdcall, importc: "GetCPInfo", dynlib: "kernel32".}
  proc nameToCodePage(name: string): CodePage =
    ## Looks up the Windows code page that `name` refers to in `winEncodings`.
    ##
    ## `name` is either a decimal code page number (e.g. ``"1252"``) or an
    ## encoding name, which is compared with `eqEncodingNames`. The first
    ## matching table entry wins. Returns ``CodePage(-1)`` if nothing matches.
    # An empty name must not match the table entries that have no name.
    if name.len == 0: return CodePage(-1)
    var nameAsInt: int
    # Only treat `name` as a number if *all* of it is a number; otherwise
    # something like "1252xyz" would silently select code page 1252.
    if parseInt(name, nameAsInt) != name.len: nameAsInt = -1
    for no, na in items(winEncodings):
      if no == nameAsInt: return CodePage(no)
      # entries without a name can only be selected by number
      if na.len != 0 and eqEncodingNames(na, name): return CodePage(no)
    result = CodePage(-1)
  proc codePageToName(c: CodePage): string =
    ## Returns the name that the first `winEncodings` entry for `c` carries.
    ## If that entry has no name, the code page number is returned as a
    ## string instead. Yields ``""`` when `c` is not in the table at all.
    let wanted = int(c)
    for entry in items(winEncodings):
      if entry[0] == wanted:
        if entry[1].len == 0: return $entry[0]
        return entry[1]
    result = ""
  # Binding for `GetACP` from kernel32.
  proc getACP(): CodePage {.
    stdcall, importc: "GetACP", dynlib: "kernel32".}
  # Binding for `MultiByteToWideChar` from kernel32. The wide-character
  # output buffer is passed as a plain `cstring`.
  proc multiByteToWideChar(codePage: CodePage; dwFlags: int32;
                           lpMultiByteStr: cstring; cbMultiByte: cint;
                           lpWideCharStr: cstring;
                           cchWideChar: cint): cint {.
    stdcall, importc: "MultiByteToWideChar", dynlib: "kernel32".}
  # Binding for `WideCharToMultiByte` from kernel32. The wide-character
  # input buffer is passed as a plain `cstring`; the two trailing
  # parameters default to nil.
  proc wideCharToMultiByte(codePage: CodePage; dwFlags: int32;
                           lpWideCharStr: cstring; cchWideChar: cint;
                           lpMultiByteStr: cstring; cbMultiByte: cint;
                           lpDefaultChar: cstring = nil;
                           lpUsedDefaultChar: pointer = nil): cint {.
    stdcall, importc: "WideCharToMultiByte", dynlib: "kernel32".}
  239. else:
  # Name (or name pattern) of the shared library `iconv` is loaded from.
  when defined(macosx):
    const iconvDll = "libiconv.dylib"
  elif defined(haiku):
    const iconvDll = "libiconv.so"
  else:
    const iconvDll = "(libc.so.6|libiconv.so)"
  # `errno` codes that are checked after a failed `iconv` call.
  const
    E2BIG = cint(7)
    EINVAL = cint(22)

  # The numeric value of EILSEQ depends on the operating system.
  when defined(linux):
    const EILSEQ = cint(84)
  elif defined(macosx):
    const EILSEQ = cint(92)
  elif defined(bsd):
    const EILSEQ = cint(86)
  elif defined(solaris):
    const EILSEQ = cint(88)
  elif defined(haiku):
    const EILSEQ = cint(-2147454938)

  # the C library's `errno`
  var errno {.importc, header: "<errno.h>".}: cint
  # FreeBSD and NetBSD import the `iconv` functions via the C header; all
  # other platforms load them dynamically from `iconvDll`.
  when not (defined(freebsd) or defined(netbsd)):
    {.pragma: importIconv, cdecl, dynlib: iconvDll.}
  else:
    {.pragma: importIconv, cdecl, header: "<iconv.h>".}

  # Binding for `iconv_open`.
  proc iconvOpen(tocode, fromcode: cstring): EncodingConverter {.
    importc: "iconv_open", importIconv.}

  # Binding for `iconv_close`.
  proc iconvClose(c: EncodingConverter) {.
    importc: "iconv_close", importIconv.}

  # Binding for `iconv` that takes the input buffer and its remaining
  # length by reference.
  proc iconv(c: EncodingConverter; inbuf: var cstring; inbytesLeft: var int;
             outbuf: var cstring; outbytesLeft: var int): int {.
    importc: "iconv", importIconv.}

  # Same C function, but with untyped input arguments so that nil can be
  # passed for them.
  proc iconv(c: EncodingConverter; inbuf: pointer; inbytesLeft: pointer;
             outbuf: var cstring; outbytesLeft: var int): int {.
    importc: "iconv", importIconv.}
  proc getCurrentEncoding*(): string =
    ## Returns the name of the current encoding. On Windows this is the name
    ## of the code page reported by `GetACP`; on every other platform
    ## "UTF-8" is returned unconditionally.
    when not defined(windows):
      result = "UTF-8"
    else:
      result = codePageToName(getACP())
  proc open*(destEncoding = "UTF-8", srcEncoding = "CP1252"): EncodingConverter =
    ## Opens a converter that can convert from `srcEncoding` to `destEncoding`.
    ## Raises `EncodingError` if it cannot fulfill the request.
    ##
    ## Pass the returned converter to `close` once it is no longer needed.
    when not defined(windows):
      result = iconvOpen(destEncoding, srcEncoding)
      # a nil handle means the converter could not be created
      if result == nil:
        raise newException(EncodingError,
          "cannot create encoding converter from " &
          srcEncoding & " to " & destEncoding)
    else:
      result.dest = nameToCodePage(destEncoding)
      result.src = nameToCodePage(srcEncoding)
      # nameToCodePage signals an unknown encoding with CodePage(-1)
      if int(result.dest) == -1:
        raise newException(EncodingError,
          "cannot find encoding " & destEncoding)
      if int(result.src) == -1:
        raise newException(EncodingError,
          "cannot find encoding " & srcEncoding)
  proc close*(c: EncodingConverter) =
    ## Frees the resources the converter `c` holds. On Windows this does
    ## nothing.
    when defined(windows):
      discard
    else:
      iconvClose(c)
  302. when defined(windows):
  proc convert*(c: EncodingConverter, s: string): string =
    ## Converts `s` to the `destEncoding` that was given to the converter `c`.
    ## `s` is assumed to be in `srcEncoding`.
    ##
    ## The text is first converted to UTF-16 LE and from there to the
    ## destination code page. Raises `OSError` if a Windows API call fails.
    # Empty input is answered directly, because MultiByteToWideChar returns
    # 0 in case of error:
    if s.len == 0: return ""

    # ---- stage 1: source code page -> UTF-16 LE (kept in `result`) ----
    # educated guess of the capacity, counted in wide characters:
    var wideCap = s.len + s.len shr 2
    result = newString(wideCap*2)
    var written = multiByteToWideChar(
      codePage = c.src,
      dwFlags = 0'i32,
      lpMultiByteStr = cstring(s),
      cbMultiByte = cint(s.len),
      lpWideCharStr = cstring(result),
      cchWideChar = cint(wideCap))
    if written != 0:
      if written <= wideCap:
        setLen(result, written*2)
      else:
        assert(false) # cannot happen
    else:
      # first attempt failed; ask for the required capacity ...
      wideCap = multiByteToWideChar(
        codePage = c.src,
        dwFlags = 0'i32,
        lpMultiByteStr = cstring(s),
        cbMultiByte = cint(s.len),
        lpWideCharStr = nil,
        cchWideChar = cint(0))
      # ... and repeat the conversion with a buffer of that size:
      result = newString(wideCap*2)
      written = multiByteToWideChar(
        codePage = c.src,
        dwFlags = 0'i32,
        lpMultiByteStr = cstring(s),
        cbMultiByte = cint(s.len),
        lpWideCharStr = cstring(result),
        cchWideChar = cint(wideCap))
      if written == 0: raiseOSError(osLastError())
      setLen(result, written*2)

    # code page 1200 is UTF-16 LE itself, so the work is already done:
    if int(c.dest) == 1200: return

    # ---- stage 2: UTF-16 LE -> destination code page ----
    var byteCap = s.len + s.len shr 2
    var res = newString(byteCap)
    written = wideCharToMultiByte(
      codePage = c.dest,
      dwFlags = 0'i32,
      lpWideCharStr = cstring(result),
      cchWideChar = cint(result.len div 2),
      lpMultiByteStr = cstring(res),
      cbMultiByte = byteCap.cint)
    if written != 0:
      if written <= byteCap:
        setLen(res, written)
        result = res
      else:
        assert(false) # cannot happen
    else:
      # first attempt failed; ask for the required capacity ...
      byteCap = wideCharToMultiByte(
        codePage = c.dest,
        dwFlags = 0'i32,
        lpWideCharStr = cstring(result),
        cchWideChar = cint(result.len div 2),
        lpMultiByteStr = nil,
        cbMultiByte = cint(0))
      # ... and repeat the conversion with a buffer of that size:
      res = newString(byteCap)
      written = wideCharToMultiByte(
        codePage = c.dest,
        dwFlags = 0'i32,
        lpWideCharStr = cstring(result),
        cchWideChar = cint(result.len div 2),
        lpMultiByteStr = cstring(res),
        cbMultiByte = byteCap.cint)
      if written == 0: raiseOSError(osLastError())
      setLen(res, written)
      result = res
  376. else:
  proc convert*(c: EncodingConverter, s: string): string =
    ## Converts `s` to the `destEncoding` that was given to the converter `c`.
    ## `s` is assumed to be in `srcEncoding`.
    ##
    ## Bytes for which `iconv` reports ``EILSEQ`` or ``EINVAL`` are copied to
    ## the output unchanged. Any other `iconv` failure raises `OSError`.
    result = newString(s.len)
    var inLen = len(s)
    var outLen = len(result)
    var src = cstring(s)
    var dst = cstring(result)
    var iconvres: int

    template growOutput() =
      # Enlarges `result` and re-derives `dst`/`outLen` from the new buffer,
      # since `setLen` may move the string's storage.
      let offset = cast[int](dst) - cast[int](cstring(result))
      setLen(result, len(result)+inLen*2+5)
      # 5 is minimally one utf-8 char
      dst = cast[cstring](cast[int](cstring(result)) + offset)
      outLen = len(result) - offset

    while inLen > 0:
      iconvres = iconv(c, src, inLen, dst, outLen)
      if iconvres == -1:
        let lerr = errno
        if lerr == EILSEQ or lerr == EINVAL:
          # unknown char: pass the byte through unchanged. The output may be
          # completely full at this point, so make room first instead of
          # writing past the end of `result`.
          if outLen <= 0: growOutput()
          dst[0] = src[0]
          src = cast[cstring](cast[int](src) + 1)
          dst = cast[cstring](cast[int](dst) + 1)
          dec(inLen)
          dec(outLen)
        elif lerr == E2BIG:
          # output buffer exhausted: enlarge it and let the loop retry
          growOutput()
        else:
          raiseOSError(lerr.OSErrorCode)
    # iconv has a buffer that needs flushing, specially if the last char is
    # not '\0'. The result of *this* call decides whether the flush has to
    # be repeated with a bigger buffer.
    iconvres = iconv(c, nil, nil, dst, outLen)
    if iconvres == -1 and errno == E2BIG:
      growOutput()
      discard iconv(c, nil, nil, dst, outLen)
    # trim output buffer
    setLen(result, len(result) - outLen)
  proc convert*(s: string, destEncoding = "UTF-8",
                srcEncoding = "CP1252"): string =
    ## Converts `s` to `destEncoding`. It is assumed that `s` is in
    ## `srcEncoding`. This opens a converter, uses it and closes it again and
    ## is thus more convenient but also likely less efficient than re-using a
    ## converter.
    let conv = open(destEncoding, srcEncoding)
    # close the converter again even if the conversion raises
    defer: close(conv)
    result = convert(conv, s)
  when not defined(testing) and isMainModule:
    # Manual demo: takes a UTF-8 literal to CP1252, on to ibm850, and finally
    # to the current encoding, echoing the results.
    let orig = "öäüß"
    let cp1252 = orig.convert("CP1252", "UTF-8")
    let ibm850 = cp1252.convert("ibm850", "CP1252")
    let current = getCurrentEncoding()
    echo "Original string from source code: ", orig
    echo "Forced ibm850 encoding: ", ibm850
    echo "Current encoding: ", current
    echo "From ibm850 to current: ", ibm850.convert(current, "ibm850")
  434. echo "From ibm850 to current: ", convert(ibm850, current, "ibm850")