gengb18030tables.pl 6.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229
  1. #!/usr/local/bin/perl
  2. # -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*-
  3. #
  4. # This Source Code Form is subject to the terms of the Mozilla Public
  5. # License, v. 2.0. If a copy of the MPL was not distributed with this
  6. # file, You can obtain one at http://mozilla.org/MPL/2.0/.
  # Global lookup tables shared by the routines in this script.  Keys and
  # values are hex strings without a "0x" prefix.
  #
  # They are initialised with the empty list.  Assigning "{}" to a hash does
  # not empty it: it stores a single entry whose key is the stringified hash
  # reference ("HASH(0x...)") and whose value is undefined, and that stray
  # entry would later be written out by the table printers.
  %gb18030tounicode     = ();  # GB18030 code -> Unicode (filled by readgb18030)
  %unicodetogb18030     = ();  # Unicode -> GB18030 code (filled by readgb18030)
  %unicodetocp936       = ();  # Unicode -> CP936 code (readcp936, addeudc)
  %cp936tounicode       = ();  # CP936 code -> Unicode (filled by readcp936)
  %tounicodecommon      = ();  # codes both tables map identically (splittable)
  %gb18030tounicodeuniq = ();  # short GB18030-only codes (splittable)
  %gb180304btounicode   = ();  # GB18030-only codes longer than 4 hex digits
  %cp936tounicodeuniq   = ();  # CP936-only codes (splittable)
  %map                  = ();  # not referenced by the code visible here

  # Number of trail-byte slots per lead byte, as counted by cp936tonum:
  # (0x7f - 0x40) slots below 0x80 plus (0xff - 0x80) slots from 0x80 upwards.
  $rowwidth = ((0xff - 0x80) + (0x7f - 0x40));
  # Convert a two-byte CP936 code into a zero-based linear index.
  #
  # Parameter: $cp936 - the code as a string whose characters 2-3 are the lead
  #            byte in hex and characters 4-5 the trail byte in hex, i.e. with
  #            a two-character prefix such as "0x8140".
  # Returns:   (lead - 0x81) * $rowwidth plus the trail byte's position in its
  #            row.  Trail bytes 0x40.. come first; trail bytes 0x80.. follow
  #            directly after them, so the unused value 0x7F takes no slot.
  #
  # Reads the global $rowwidth.  The sub is declared without a prototype: the
  # former empty prototype "()" claimed it takes no arguments, which made a
  # plain cp936tonum($code) call a compile-time error.
  sub cp936tonum
  {
      my ($cp936) = @_;
      my $first  = hex(substr($cp936, 2, 2));
      my $second = hex(substr($cp936, 4, 2));

      my $jnum = ($first - 0x81) * $rowwidth;
      if ($second >= 0x80) {
          # Upper half of the row sits after the (0x7f - 0x40) lower slots.
          $jnum += $second - 0x80 + (0x7f - 0x40);
      }
      else {
          $jnum += $second - 0x40;
      }
      return $jnum;
  }
  # Add Unicode -> CP936 entries for three code ranges to %unicodetocp936,
  # assigning consecutive Unicode values that start at U+E000:
  #
  #   AAA1-AFFE   lead bytes AA-AF, trail bytes A1-FE
  #   F8A1-FEFE   lead bytes F8-FE, trail bytes A1-FE
  #   A140-A7A0   lead bytes A1-A7, trail bytes 40-7E and 80-A0
  #
  # Existing entries for those Unicode values are overwritten.  Takes no
  # arguments and returns nothing.
  #
  # The running code point is a lexical here.  It used to be the package
  # global $u, which readcp936 also uses, so calling this sub silently
  # changed that global.
  sub addeudc
  {
      my $u = 0xE000;

      # For AAA1-AFFE
      foreach my $h (0xAA .. 0xAF) {
          foreach my $l (0xA1 .. 0xFE) {
              $unicodetocp936{ sprintf("%04X", $u) } = sprintf("%02X%02X", $h, $l);
              $u++;
          }
      }

      # For F8A1-FEFE
      foreach my $h (0xF8 .. 0xFE) {
          foreach my $l (0xA1 .. 0xFE) {
              $unicodetocp936{ sprintf("%04X", $u) } = sprintf("%02X%02X", $h, $l);
              $u++;
          }
      }

      # For A140-A7A0.  We need to skip 7F, hence the two trail-byte runs.
      foreach my $h (0xA1 .. 0xA7) {
          foreach my $l (0x40 .. 0x7E, 0x80 .. 0xA0) {
              $unicodetocp936{ sprintf("%04X", $u) } = sprintf("%02X%02X", $h, $l);
              $u++;
          }
      }
      return;
  }
  # Read the tab-separated file "CP936.txt" from the current directory and
  # fill %cp936tounicode and %unicodetocp936.
  #
  # Lines starting with "#" are skipped.  On every other line the first field
  # is the CP936 code and the second the Unicode value; a line is used only
  # when the second field starts with "0x".  The "0x" prefix is stripped from
  # both fields before they are stored.
  #
  # When a Unicode value already has a CP936 code, the first code is kept and
  # a warning is written to STDERR.  Dies if the file cannot be opened.
  sub readcp936
  {
      open(my $in, '<', 'CP936.txt') || die "Cannot open CP936 file: $!";
      while (my $line = <$in>) {
          next if $line =~ /^#/;
          # chomp, not chop: an unterminated last line must keep its final
          # character.
          chomp($line);
          my ($gb, $u) = split(/\t/, $line);
          next unless defined($u) && $u =~ /^0x/;

          my $u1  = substr($u, 2, 4);
          my $gb1 = substr($gb, 2, 4);
          $cp936tounicode{$gb1} = $u1;

          # String comparison: with "==" a stored code such as "A1A1" is
          # numerically 0 and would be mistaken for "no entry yet".
          if (!defined($unicodetocp936{$u1}) || $unicodetocp936{$u1} eq "") {
              $unicodetocp936{$u1} = $gb1;
          }
          else {
              # The message used to be a bare string expression that was
              # evaluated and thrown away; actually report it.
              warn "WARNING: Unicode " . $u1 . " already map to CP936 " .
                   $unicodetocp936{$u1} . " when we try to map to " . $gb1 . "\n";
          }
      }
      close($in);
      return;
  }
  # Read the file "GB18030" from the current directory and fill
  # %gb18030tounicode and %unicodetogb18030.
  #
  # Only lines starting with an upper-case hex digit are used.  Each such line
  # holds a Unicode value and a GB18030 code separated by one whitespace
  # character; lines with no second field are skipped.
  #
  # When a Unicode value already has a GB18030 code, the first code is kept
  # and a warning is written to STDERR.  Dies if the file cannot be opened.
  sub readgb18030
  {
      open(my $in, '<', 'GB18030') || die "Cannot open GB18030 file: $!";
      while (my $line = <$in>) {
          next unless $line =~ /^[0-9A-F]/;
          # chomp, not chop: an unterminated last line must keep its final
          # character.
          chomp($line);
          my ($u, $gb) = split(/\s/, $line);
          next unless defined($gb) && $gb ne "";

          $gb18030tounicode{$gb} = $u;

          # String comparison: with "==" a stored code such as "A1A1" is
          # numerically 0 and would be mistaken for "no entry yet".
          if (!defined($unicodetogb18030{$u}) || $unicodetogb18030{$u} eq "") {
              $unicodetogb18030{$u} = $gb;
          }
          else {
              # This message was copied from readcp936 with that sub's
              # variables and never printed; report the values at hand.
              warn "WARNING: Unicode " . $u . " already map to GB18030 " .
                   $unicodetogb18030{$u} . " when we try to map to " . $gb . "\n";
          }
      }
      close($in);
      return;
  }
  # Sort every Unicode value 0000-FFFF into one of four output tables by
  # comparing its GB18030 code with its CP936 code:
  #
  #   same non-empty code in both     -> %tounicodecommon
  #   GB18030 code, > 4 hex digits    -> %gb180304btounicode
  #   GB18030 code, <= 4 hex digits   -> %gb18030tounicodeuniq
  #   CP936 code (when they differ)   -> %cp936tounicodeuniq
  #
  # Each output table is keyed by the code and holds the four-digit Unicode
  # hex string.  Reads %unicodetogb18030 and %unicodetocp936; takes no
  # arguments and returns nothing.
  sub splittable
  {
      for (my $i = 0; $i < 0x10000; $i++) {
          my $u = sprintf("%04X", $i);
          # Treat a missing mapping as the empty string.
          my $gb = defined($unicodetogb18030{$u}) ? $unicodetogb18030{$u} : "";
          my $cp = defined($unicodetocp936{$u})   ? $unicodetocp936{$u}   : "";

          if ($gb eq $cp) {
              $tounicodecommon{$gb} = $u if $gb ne "";
              next;
          }

          if ($gb ne "") {
              # length() must be called as a function.  "$x.length > 4"
              # concatenates length($_) onto the code and compares the result
              # numerically, so the outcome depended on whether the code
              # started with a decimal digit.
              if (length($gb) > 4) {
                  $gb180304btounicode{$gb} = $u;
              }
              else {
                  $gb18030tounicodeuniq{$gb} = $u;
              }
          }
          if ($cp ne "") {
              $cp936tounicodeuniq{$cp} = $u;
          }
      }
      return;
  }
  # Convert a four-byte GB18030 code into a linear index.
  #
  # Parameter: $gb - the code as eight hex digits without prefix, e.g.
  #            "81308130".
  # Returns:   the index as a hex string of at least four upper-case digits.
  #            Bytes 1 and 3 are taken relative to 0x81 (radix 126 is applied
  #            to byte 3), bytes 2 and 4 relative to 0x30 (radix 10).
  #
  # Declared without a prototype: the former empty prototype "()" claimed the
  # sub takes no arguments, so a plain gb4bytestoidx($gb) call failed to
  # compile.
  sub gb4bytestoidx
  {
      my ($gb) = @_;
      my $b1 = hex(substr($gb, 0, 2)) - 0x81;
      my $b2 = hex(substr($gb, 2, 2)) - 0x30;
      my $b3 = hex(substr($gb, 4, 2)) - 0x81;
      my $b4 = hex(substr($gb, 6, 2)) - 0x30;
      return sprintf("%04X", ((($b1 * 10) + $b2) * 126 + $b3) * 10 + $b4);
  }
  # Write %tounicodecommon to "gbkcommon.txt" in the current directory, one
  # "0x<code><TAB>0x<unicode>" line per entry, ordered by string-sorted code.
  #
  # Dies if the file cannot be opened or if closing it fails.  Buffered write
  # errors only surface at close, so that result is checked too.
  sub printcommontable
  {
      open(my $out, '>', 'gbkcommon.txt') || die "cannot open gbkcommon.txt: $!";
      foreach my $gb (sort(keys %tounicodecommon)) {
          print $out "0x" . $gb . "\t0x" . $tounicodecommon{$gb} . "\n";
      }
      close($out) || die "cannot close gbkcommon.txt: $!";
      return;
  }
  # Write %cp936tounicodeuniq to "cp936uniq.txt" in the current directory, one
  # "0x<code><TAB>0x<unicode>" line per entry, ordered by string-sorted code.
  #
  # Dies if the file cannot be opened or if closing it fails.  Buffered write
  # errors only surface at close, so that result is checked too.
  sub printcp936table
  {
      open(my $out, '>', 'cp936uniq.txt') || die "cannot open cp936uniq.txt: $!";
      foreach my $gb (sort(keys %cp936tounicodeuniq)) {
          print $out "0x" . $gb . "\t0x" . $cp936tounicodeuniq{$gb} . "\n";
      }
      close($out) || die "cannot close cp936uniq.txt: $!";
      return;
  }
  # Write %gb180304btounicode to "gb180304b.txt" in the current directory,
  # ordered by string-sorted code.  Each line has the form
  # "0x<index><TAB>0x<unicode><TAB># 0x<code>", where <index> is what
  # gb4bytestoidx returns for the code.  Entries whose Unicode value is
  # "FFFF" are left out.
  #
  # Dies if the file cannot be opened or if closing it fails.  Buffered write
  # errors only surface at close, so that result is checked too.
  sub printgb180304btable
  {
      open(my $out, '>', 'gb180304b.txt') || die "cannot open gb180304b.txt: $!";
      foreach my $gb (sort(keys %gb180304btounicode)) {
          next if $gb180304btounicode{$gb} eq "FFFF";
          # The "&" call form is kept on purpose: it works whether or not
          # gb4bytestoidx is declared with a prototype.
          print $out "0x" . &gb4bytestoidx($gb) . "\t0x" . $gb180304btounicode{$gb} . "\t# 0x" . $gb . "\n";
      }
      close($out) || die "cannot close gb180304b.txt: $!";
      return;
  }
  # Write %gb18030tounicodeuniq to "gb18030uniq.txt" in the current directory,
  # one "0x<code><TAB>0x<unicode>" line per entry, ordered by string-sorted
  # code.
  #
  # Dies if the file cannot be opened or if closing it fails.  Buffered write
  # errors only surface at close, so that result is checked too.
  sub printgb18030table
  {
      open(my $out, '>', 'gb18030uniq.txt') || die "cannot open gb18030uniq.txt: $!";
      foreach my $gb (sort(keys %gb18030tounicodeuniq)) {
          print $out "0x" . $gb . "\t0x" . $gb18030tounicodeuniq{$gb} . "\n";
      }
      close($out) || die "cannot close gb18030uniq.txt: $!";
      return;
  }
  # Run the external post-processing steps on the text tables written by the
  # print*table routines.  Each command line is echoed to standard output and
  # then handed to the shell via system().  Exit statuses are not examined;
  # the status of the final command is the sub's return value.
  sub genufut()
  {
      my @commands = (
          "umaptable -uf < gb18030uniq.txt > gb18030uniq2b.uf",
          "umaptable -ut < gb18030uniq.txt > gb18030uniq2b.ut",
          "umaptable -uf < cp936uniq.txt > gbkuniq.uf",
          "umaptable -ut < cp936uniq.txt > gbkuniq.ut",
          "umaptable -uf < gb180304b.txt > gb180304bytes.uf",
          "umaptable -ut < gb180304b.txt > gb180304bytes.ut",
          "perl cp936tocdx.pl > cp936map.h",
      );

      my $status;
      foreach my $command (@commands) {
          print($command . "\n");
          $status = system($command);
      }
      return $status;
  }
  # Main sequence.  The order matters: both source tables are loaded first,
  # addeudc then writes its entries over whatever readcp936 stored, and
  # splittable must run before any of the printers.  genufut comes last
  # because it consumes the files the printers write.
  readgb18030();
  readcp936();
  addeudc();
  splittable();

  printcommontable();
  printgb180304btable();
  printgb18030table();
  printcp936table();

  genufut();