123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357 |
- #!/usr/bin/perl
- #
- # This Source Code Form is subject to the terms of the Mozilla Public
- # License, v. 2.0. If a copy of the MPL was not distributed with this
- # file, You can obtain one at http://mozilla.org/MPL/2.0/.
######################################################################
#
# Initialise the script-wide variables.
# NOTE(review): nothing in the visible script reads these again;
# confirm before removing them.
#
######################################################################
%utot = ();
($ui, $li) = (0, 0);
######################################################################
#
# Open the three input tables and the two generated output files.
#
# The three-argument form of open keeps the mode separate from the
# path, and every failure message ends with $! so the operating-system
# reason (missing file, permissions, ...) is reported instead of being
# lost.
#
######################################################################

# Input: the Unicode character database.
open(UNICODATA, "<", "../../unicharutil/tools/UnicodeData-Latest.txt")
    or die "cannot find UnicodeData-Latest.txt: $!";

# Input: the JIS X 4051 class file.
open(CLASS, "<", "jisx4051class.txt")
    or die "cannot find jisx4051class.txt: $!";

# Input: the JIS X 4051 class simplified mapping.
open(SIMP, "<", "jisx4051simp.txt")
    or die "cannot find jisx4051simp.txt: $!";

# Output: the HTML analysis report.
open(OUT, ">", "anzx4051.html")
    or die "cannot open output anzx4051.html file: $!";

# Output: the generated C header.
open(HEADER, ">", "../src/jisx4051class.h")
    or die "cannot open output ../src/jisx4051class.h file: $!";
######################################################################
#
# Write the opening part of the HTML report (license comment, title
# and top-level heading) to OUT.
#
######################################################################
# Nothing in the text needs interpolation, so a single-quoted heredoc
# produces exactly the same string.
$hthmlheader = <<'END_OF_HTML';
<!-- This Source Code Form is subject to the terms of the Mozilla Public
- License, v. 2.0. If a copy of the MPL was not distributed with this
- file, You can obtain one at http://mozilla.org/MPL/2.0/. -->
<HTML>
<HEAD>
<TITLE>
Analysis of JIS X 4051 to Unicode General Category Mapping
</TITLE>
</HEAD>
<BODY>
<H1>
Analysis of JIS X 4051 to Unicode General Category Mapping
</H1>
END_OF_HTML

print OUT $hthmlheader;
######################################################################
#
# Write the license comment and the "generated file" notice at the top
# of the C header.
#
######################################################################
# Nothing in the text needs interpolation, so a single-quoted heredoc
# produces exactly the same string.
$npl = <<'END_OF_NPL';
/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
/*
DO NOT EDIT THIS DOCUMENT !!! THIS DOCUMENT IS GENERATED BY
mozilla/intl/lwbrk/tools/anzx4051.pl
*/
END_OF_NPL

print HEADER $npl;
# Start every lookup/count table empty:
#   %occ        code point (integer)     -> "simplified class | class"
#   %gcat       code point (hex text)    -> general category
#   %dcat       code point (hex text)    -> first letter of the category
#   %simp       class                    -> simplified class
#   %gcount     general category         -> number of UnicodeData lines
#   %dcount     first category letter    -> number of UnicodeData lines
#   %sccount    simplified class         -> number of mapping lines
#   %rangecount " HH:simplified class"   -> number of code points
for my $table (\%occ, \%gcat, \%dcat, \%simp,
               \%gcount, \%dcount, \%sccount, \%rangecount) {
    %$table = ();
}
######################################################################
#
# Read the Unicode database line by line.  For every entry remember the
# general category (third ';'-separated field) and its first letter,
# keyed by the code point text from the first field, and count how many
# entries fall into each.
#
######################################################################
while (my $line = <UNICODATA>) {
    # chomp removes only a trailing newline; chop would also eat a real
    # character from a final line that has none.
    chomp $line;

    my @fields   = split(/;/, $line);
    my $code     = $fields[0];              # the unicode value, as hex text
    my $category = $fields[2];              # e.g. two-letter general category
    my $major    = substr($category, 0, 1); # first letter of the category

    $gcat{$code} = $category;
    $dcat{$code} = $major;
    $gcount{$category}++;
    $dcount{$major}++;
}
# Close the handle that was actually opened above (UNICODATA).
close(UNICODATA);
######################################################################
#
# Read the simplified mapping: each line is "class;simplified class".
# Fill %simp and count how many classes map to each simplified class.
#
######################################################################
while (my $line = <SIMP>) {
    # chomp, not chop: a last line without a newline must keep the final
    # character of its simplified class.
    chomp $line;

    my @fields = split(/;/, $line);
    $simp{$fields[0]} = $fields[1];
    $sccount{$fields[1]}++;
}
close(SIMP);
# GetClass($u)
#
# Return the general category recorded in %gcat for code point $u
# (an integer).  When %gcat has no entry, fall back to fixed ranges:
#   U+3400..U+9FA5 -> "Han"
#   U+AC00..U+D7A3 -> "Lo"
#   U+D800..U+DFFF -> "Cs"
#   U+E000..U+F8FF -> "Co"
# Anything else prints a warning on STDOUT and yields the empty string,
# so the status of printf is never mistaken for a category.
sub GetClass {
    my ($u) = @_;
    my $hex = DecToHex($u);

    # Lexical on purpose: the lookup must not overwrite the script-wide $g.
    my $g = $gcat{$hex};
    return $g if defined($g) && $g ne "";

    return "Han" if 0x3400 <= $u && $u <= 0x9fa5;
    return "Lo"  if 0xac00 <= $u && $u <= 0xd7a3;
    return "Cs"  if 0xd800 <= $u && $u <= 0xdfff;
    return "Co"  if 0xe000 <= $u && $u <= 0xf8ff;

    printf "WARNING !!!! Cannot find General Category for U+%s \n", $hex;
    return "";
}
# GetDClass($u)
#
# Return the one-letter category recorded in %dcat for code point $u
# (an integer).  When %dcat has no entry, fall back to fixed ranges:
#   U+3400..U+9FA5 -> "Han"
#   U+AC00..U+D7A3 -> "L"
#   U+D800..U+F8FF -> "C"
# Anything else prints a warning on STDOUT and yields the empty string,
# so the status of printf is never mistaken for a category.
sub GetDClass {
    my ($u) = @_;
    my $hex = DecToHex($u);

    # Lexical on purpose: the lookup must not overwrite the script-wide $g.
    my $g = $dcat{$hex};
    return $g if defined($g) && $g ne "";

    return "Han" if 0x3400 <= $u && $u <= 0x9fa5;
    return "L"   if 0xac00 <= $u && $u <= 0xd7a3;
    return "C"   if 0xd800 <= $u && $u <= 0xf8ff;

    printf "WARNING !!!! Cannot find Detailed General Category for U+%s \n", $hex;
    return "";
}
# DecToHex($d)
#
# Format the number $d as upper-case hexadecimal, zero-padded to at
# least four digits (longer values are not truncated).
sub DecToHex {
    my $value = shift;
    return sprintf('%04X', $value);
}
######################################################################
#
# Read the JIS X 4051 class file.  Each line is "first;last;class" with
# hexadecimal code points; an empty "last" means a single code point.
# For every code point not claimed by an earlier line, record its class
# in %occ and update the per-category and per-range counters.
#
######################################################################
%gtotal = ();   # simplified class . general category    -> code points
%dtotal = ();   # simplified class . first category letter -> code points

while (my $line = <CLASS>) {
    # chomp, not chop: a last line without a newline must keep the final
    # character of its class field.
    chomp $line;

    my @fields = split(/;/, $line);
    my ($first_hex, $last_hex, $class) = @fields[0 .. 2];

    # A blank or malformed line has no class field; without this guard it
    # would be read as U+0000 with an undefined class.
    next unless defined($class) && $class ne "";

    # Classes whose name starts with "a" are left out of the analysis.
    next if substr($class, 0, 1) eq "a";

    my $sc   = $simp{$class};
    my $low  = hex($first_hex);
    my $high = ($last_hex eq "") ? $low : hex($last_hex);

    for my $cp ($low .. $high) {
        # The first definition of a code point wins; later ones are ignored.
        next if exists $occ{$cp};

        $occ{$cp} = $sc . " | " . $class;

        my $gclass = GetClass($cp);
        my $dclass = GetDClass($cp);
        $gtotal{$sc . $gclass}++;
        $dtotal{$sc . $dclass}++;

        # Bucket by the first two hex digits of the code point.
        my $range_key = " " . substr(DecToHex($cp), 0, 2) . ":" . $sc;
        $rangecount{$range_key}++;
    }
}
#print %gtotal;
#print %dtotal;
# printreport()
#
# Write two HTML tables to OUT.
#   1. One row per simplified class: the code-point count for each
#      one-letter category (%dtotal), their total, then the count for
#      each general category (%gtotal).
#   2. One row per two-hex-digit range 00..4E that has at least one code
#      point, with one column per simplified class (%rangecount).
# Counts that were never recorded are printed as empty cells.
sub printreport
{
    my @major_cats   = sort keys %dcount;
    my @general_cats = sort keys %gcount;
    my @simple_cls   = sort keys %sccount;

    # ---- Table 1: simplified class versus Unicode category ----
    my $html = "<TABLE BORDER=3>\n" . "<TR BGCOLOR=blue><TH><TH>\n";
    $html .= sprintf("<TD BGCOLOR=red>%s</TD>\n", $_) for @major_cats;
    $html .= "<TD BGCOLOR=white>Total</TD>\n";
    $html .= sprintf("<TD BGCOLOR=yellow>%s</TD>\n", $_) for @general_cats;
    $html .= "</TR>\n";

    for my $class (@simple_cls) {
        $html .= "<TR><TH>" . $class . "<TH>\n";

        my $row_total = 0;
        for my $major (@major_cats) {
            my $n = $dtotal{$class . $major};
            $row_total += $n;
            $html .= sprintf("<TD>%s</TD>\n", $n);
        }
        $html .= sprintf("<TD BGCOLOR=white>%s</TD>\n", $row_total);

        for my $general (@general_cats) {
            $html .= sprintf("<TD>%s</TD>\n", $gtotal{$class . $general});
        }
        $html .= "</TR>\n";
    }
    $html .= "</TABLE>\n";
    print OUT $html;

    # ---- Table 2: simplified class per two-hex-digit range ----
    my $ranges = "<TABLE BORDER=3>\n" . "<TR BGCOLOR=blue><TH><TH>\n";
    $ranges .= sprintf("<TD BGCOLOR=red>%s</TD>\n", $_) for @simple_cls;
    $ranges .= "</TR>\n";

    for my $page (0x00 .. 0x4e) {
        my $label = sprintf("%02X", $page);
        my $row   = "<TR><TH>" . $label . "<TH>\n";
        my $hits  = 0;

        for my $class (@simple_cls) {
            my $n = $rangecount{" " . $label . ":" . $class};
            $row  .= sprintf("<TD>%s</TD>\n", $n);
            $hits += $n;
        }
        $row .= "</TR>\n";

        # Rows without any code point are left out of the table.
        $ranges .= $row if $hits != 0;
    }
    $ranges .= "</TABLE>\n";
    print OUT $ranges;
}

printreport();
# printarray($r, $def)
#
# Write the C array gLBClass<$r> to HEADER for the 256 code points whose
# upper byte is the hex string $r.  Each of the 32 array elements packs
# eight code points, one character per code point, highest code point
# first.  The character is the second character of the code point's
# %occ entry, or $def when %occ has no entry.  A progress line
# "[$r || $def]" is printed on STDOUT.
sub printarray
{
    my ($r, $def) = @_;
    printf "[%s || %s]\n", $r, $def;

    my $base = hex($r) * 256;
    printf HEADER "static const uint32_t gLBClass%s[32] = {\n", $r;

    for (my $offset = 0; $offset < 256; $offset += 8) {
        my $first = $base + $offset;

        # Collect the eight digits from the highest code point down.
        my $word = "";
        for my $cp (reverse($first .. $first + 7)) {
            $word .= exists($occ{$cp}) ? substr($occ{$cp}, 1, 1) : $def;
        }

        printf HEADER "0x%s, // U+%04X - U+%04X\n", $word, $first, $first + 7;
    }
    print HEADER "};\n\n";
}
# Emit one gLBClass array per range.  Each pair is the upper byte of the
# range (hex text) and the character printarray uses for code points
# that have no entry in %occ.
for my $range (["00", "7"], ["20", "7"], ["21", "7"],
               ["30", "5"], ["0E", "8"], ["17", "7"]) {
    printarray(@$range);
}
#print %rangecount;

######################################################################
#
# Close files.  Buffered write errors only surface at close, so the two
# output handles are checked: a truncated generated header or report
# must not go unnoticed.
#
######################################################################
close(HEADER) or die "cannot close output ../src/jisx4051class.h file: $!";
close(CLASS);
close(OUT) or die "cannot close output anzx4051.html file: $!";
|