anzx4051.pl 8.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357
  1. #!/usr/bin/perl
  2. #
  3. # This Source Code Form is subject to the terms of the Mozilla Public
  4. # License, v. 2.0. If a copy of the MPL was not distributed with this
  5. # file, You can obtain one at http://mozilla.org/MPL/2.0/.
  6. ######################################################################
  7. #
  8. # Initial global variable
  9. #
  10. ######################################################################
  11. %utot = ();
  12. $ui=0;
  13. $li=0;
  14. ######################################################################
  15. #
  16. # Open the unicode database file
  17. #
  18. ######################################################################
  19. open ( UNICODATA , "< ../../unicharutil/tools/UnicodeData-Latest.txt")
  20. || die "cannot find UnicodeData-Latest.txt";
  21. ######################################################################
  22. #
  23. # Open the JIS X 4051 Class file
  24. #
  25. ######################################################################
  26. open ( CLASS , "< jisx4051class.txt")
  27. || die "cannot find jisx4051class.txt";
  28. ######################################################################
  29. #
  30. # Open the JIS X 4051 Class simplified mapping
  31. #
  32. ######################################################################
  33. open ( SIMP , "< jisx4051simp.txt")
  34. || die "cannot find jisx4051simp.txt";
  35. ######################################################################
  36. #
  37. # Open the output file
  38. #
  39. ######################################################################
  40. open ( OUT , "> anzx4051.html")
  41. || die "cannot open output anzx4051.html file";
  42. ######################################################################
  43. #
  44. # Open the output file
  45. #
  46. ######################################################################
  47. open ( HEADER , "> ../src/jisx4051class.h")
  48. || die "cannot open output ../src/jisx4051class.h file";
  49. ######################################################################
  50. #
  51. # Generate license and header
  52. #
  53. ######################################################################
  54. $hthmlheader = <<END_OF_HTML;
  55. <!-- This Source Code Form is subject to the terms of the Mozilla Public
  56. - License, v. 2.0. If a copy of the MPL was not distributed with this
  57. - file, You can obtain one at http://mozilla.org/MPL/2.0/. -->
  58. <HTML>
  59. <HEAD>
  60. <TITLE>
  61. Analysis of JIS X 4051 to Unicode General Category Mapping
  62. </TITLE>
  63. </HEAD>
  64. <BODY>
  65. <H1>
  66. Analysis of JIS X 4051 to Unicode General Category Mapping
  67. </H1>
  68. END_OF_HTML
  69. print OUT $hthmlheader;
  70. ######################################################################
  71. #
  72. # Generate license and header
  73. #
  74. ######################################################################
  75. $npl = <<END_OF_NPL;
  76. /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
  77. /* This Source Code Form is subject to the terms of the Mozilla Public
  78. * License, v. 2.0. If a copy of the MPL was not distributed with this
  79. * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
  80. /*
  81. DO NOT EDIT THIS DOCUMENT !!! THIS DOCUMENT IS GENERATED BY
  82. mozilla/intl/lwbrk/tools/anzx4051.pl
  83. */
  84. END_OF_NPL
  85. print HEADER $npl;
  86. %occ = ();
  87. %gcat = ();
  88. %dcat = ();
  89. %simp = ();
  90. %gcount = ();
  91. %dcount = ();
  92. %sccount = ();
  93. %rangecount = ();
  94. ######################################################################
  95. #
  96. # Process the file line by line
  97. #
  98. ######################################################################
  99. while(<UNICODATA>) {
  100. chop;
  101. ######################################################################
  102. #
  103. # Get value from fields
  104. #
  105. ######################################################################
  106. @f = split(/;/ , $_);
  107. $c = $f[0]; # The unicode value
  108. $g = $f[2];
  109. $d = substr($g, 0, 1);
  110. $gcat{$c} = $g;
  111. $dcat{$c} = $d;
  112. $gcount{$g}++;
  113. $dcount{$d}++;
  114. }
  115. close(UNIDATA);
  116. while(<SIMP>) {
  117. chop;
  118. ######################################################################
  119. #
  120. # Get value from fields
  121. #
  122. ######################################################################
  123. @f = split(/;/ , $_);
  124. $simp{$f[0]} = $f[1];
  125. $sccount{$f[1]}++;
  126. }
  127. close(SIMP);
  128. sub GetClass{
  129. my ($u) = @_;
  130. my $hex = DecToHex($u);
  131. $g = $gcat{$hex};
  132. if($g ne "") {
  133. return $g;
  134. } elsif (( 0x3400 <= $u) && ( $u <= 0x9fa5 ) ) {
  135. return "Han";
  136. } elsif (( 0xac00 <= $u) && ( $u <= 0xd7a3 ) ) {
  137. return "Lo";
  138. } elsif (( 0xd800 <= $u) && ( $u <= 0xdb7f ) ) {
  139. return "Cs";
  140. } elsif (( 0xdb80 <= $u) && ( $u <= 0xdbff ) ) {
  141. return "Cs";
  142. } elsif (( 0xdc00 <= $u) && ( $u <= 0xdfff ) ) {
  143. return "Cs";
  144. } elsif (( 0xe000 <= $u) && ( $u <= 0xf8ff ) ) {
  145. return "Co";
  146. } else {
  147. printf "WARNING !!!! Cannot find General Category for U+%s \n" , $hex;
  148. }
  149. }
  150. sub GetDClass{
  151. my ($u) = @_;
  152. my $hex = DecToHex($u);
  153. $g = $dcat{$hex};
  154. if($g ne "") {
  155. return $g;
  156. } elsif (( 0x3400 <= $u) && ( $u <= 0x9fa5 ) ) {
  157. return "Han";
  158. } elsif (( 0xac00 <= $u) && ( $u <= 0xd7a3 ) ) {
  159. return "L";
  160. } elsif (( 0xd800 <= $u) && ( $u <= 0xdb7f ) ) {
  161. return "C";
  162. } elsif (( 0xdb80 <= $u) && ( $u <= 0xdbff ) ) {
  163. return "C";
  164. } elsif (( 0xdc00 <= $u) && ( $u <= 0xdfff ) ) {
  165. return "C";
  166. } elsif (( 0xe000 <= $u) && ( $u <= 0xf8ff ) ) {
  167. return "C";
  168. } else {
  169. printf "WARNING !!!! Cannot find Detailed General Category for U+%s \n" , $hex;
  170. }
  171. }
  172. sub DecToHex{
  173. my ($d) = @_;
  174. return sprintf("%04X", $d);
  175. }
  176. %gtotal = ();
  177. %dtotal = ();
  178. while(<CLASS>) {
  179. chop;
  180. ######################################################################
  181. #
  182. # Get value from fields
  183. #
  184. ######################################################################
  185. @f = split(/;/ , $_);
  186. if( substr($f[2], 0, 1) ne "a")
  187. {
  188. $sc = $simp{$f[2]};
  189. $l = hex($f[0]);
  190. if($f[1] eq "")
  191. {
  192. $h = $l;
  193. } else {
  194. $h = hex($f[1]);
  195. }
  196. for($k = $l; $k <= $h ; $k++)
  197. {
  198. if( exists($occ{$k}))
  199. {
  200. # printf "WARNING !! Conflict defination!!! U+%s -> [%s] [%s | %s]\n",
  201. # DecToHex($k), $occ{$k} , $f[2] , $sc;
  202. }
  203. else
  204. {
  205. $occ{$k} = $sc . " | " . $f[2];
  206. $gclass = GetClass($k);
  207. $dclass = GetDClass($k);
  208. $gtotal{$sc . $gclass}++;
  209. $dtotal{$sc . $dclass}++;
  210. $u = DecToHex($k);
  211. $rk = " " . substr($u,0,2) . ":" . $sc;
  212. $rangecount{$rk}++;
  213. }
  214. }
  215. }
  216. }
  217. #print %gtotal;
  218. #print %dtotal;
  219. sub printreport
  220. {
  221. print OUT "<TABLE BORDER=3>\n";
  222. print OUT "<TR BGCOLOR=blue><TH><TH>\n";
  223. foreach $d (sort(keys %dcount)) {
  224. print OUT "<TD BGCOLOR=red>$d</TD>\n";
  225. }
  226. print OUT "<TD BGCOLOR=white>Total</TD>\n";
  227. foreach $g (sort(keys %gcount)) {
  228. print OUT "<TD BGCOLOR=yellow>$g</TD>\n";
  229. }
  230. print OUT "</TR>\n";
  231. foreach $sc (sort(keys %sccount)) {
  232. print OUT "<TR><TH>$sc<TH>\n";
  233. $total = 0;
  234. foreach $d (sort (keys %dcount)) {
  235. $count = $dtotal{$sc . $d};
  236. $total += $count;
  237. print OUT "<TD>$count</TD>\n";
  238. }
  239. print OUT "<TD BGCOLOR=white>$total</TD>\n";
  240. foreach $g (sort(keys %gcount)) {
  241. $count = $gtotal{$sc . $g};
  242. print OUT "<TD>$count</TD>\n";
  243. }
  244. print OUT "</TR>\n";
  245. }
  246. print OUT "</TABLE>\n";
  247. print OUT "<TABLE BORDER=3>\n";
  248. print OUT "<TR BGCOLOR=blue><TH><TH>\n";
  249. foreach $sc (sort(keys %sccount))
  250. {
  251. print OUT "<TD BGCOLOR=red>$sc</TD>\n";
  252. }
  253. print OUT "</TR>\n";
  254. for($rr = 0; $rr < 0x4f; $rr++)
  255. {
  256. $empty = 0;
  257. $r = sprintf("%02X" , $rr) ;
  258. $tmp = "<TR><TH>" . $r . "<TH>\n";
  259. foreach $sc (sort(keys %sccount)) {
  260. $count = $rangecount{ " " .$r . ":" .$sc};
  261. $tmp .= sprintf("<TD>%s</TD>\n", $count);
  262. $empty += $count;
  263. }
  264. $tmp .= "</TR>\n";
  265. if($empty ne 0)
  266. {
  267. print OUT $tmp;
  268. }
  269. }
  270. print OUT "</TABLE>\n";
  271. }
  272. printreport();
  273. sub printarray
  274. {
  275. my($r, $def) = @_;
  276. printf "[%s || %s]\n", $r, $def;
  277. $k = hex($r) * 256;
  278. printf HEADER "static const uint32_t gLBClass%s[32] = {\n", $r;
  279. for($i = 0 ; $i < 256; $i+= 8)
  280. {
  281. for($j = 7 ; $j >= 0; $j-- )
  282. {
  283. $v = $k + $i + $j;
  284. if( exists($occ{$v}))
  285. {
  286. $p = substr($occ{$v}, 1,1);
  287. } else {
  288. $p = $def;
  289. }
  290. if($j eq 7 )
  291. {
  292. printf HEADER "0x%s" , $p;
  293. } else {
  294. printf HEADER "%s", $p ;
  295. }
  296. }
  297. printf HEADER ", // U+%04X - U+%04X\n", $k + $i ,( $k + $i + 7);
  298. }
  299. print HEADER "};\n\n";
  300. }
  301. printarray("00", "7");
  302. printarray("20", "7");
  303. printarray("21", "7");
  304. printarray("30", "5");
  305. printarray("0E", "8");
  306. printarray("17", "7");
  307. #print %rangecount;
  308. ######################################################################
  309. #
  310. # Close files
  311. #
  312. ######################################################################
  313. close(HEADER);
  314. close(CLASS);
  315. close(OUT);