123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147 |
- #!/usr/bin/perl -w
- # unicode-decomp.pl - script to generate database for java.text.Collator
- # Copyright (C) 1998, 1999, 2002 Free Software Foundation, Inc.
- #
- # This file is part of libjava.
- #
- # This software is copyrighted work licensed under the terms of the
- # Libjava License. Please consult the file "LIBJAVA_LICENSE" for
- # details.
- # Code for reading UnicodeData.txt and generating the decomposition
- # tables in include/java-chardecomp.h (included by natCollator.cc). For
- # now, the relevant Unicode definition files are found in
- # libjava/gnu/gcj/convert/.
- #
- # Usage: ./unicode-decomp.pl [-n] <UnicodeData.txt> <decomp.h>
- # where <UnicodeData.txt> is obtained from www.unicode.org (named
- # UnicodeData-3.0.0.txt for Unicode version 3.0.0), and <decomp.h>
- # is the final location of include/java-chardecomp.h.
- # As of JDK 1.4, use Unicode version 3.0.0 for best results.
- #
- # If this exits with nonzero status, then you must investigate the
- # cause of the problem.
- # Diagnostics and other information to stderr.
- # With -n, the files are not created, but all processing still occurs.
use strict;
use warnings;

# These map characters to their decompositions.  Keys are numeric code
# points; values are the decomposition packed as 16-bit big-endian
# units (the format produced by pack "n*").
my %canonical_decomposition = ();
my %full_decomposition = ();

# Handle `-n' and open output files.
if (@ARGV && $ARGV[0] eq '-n')
{
    shift @ARGV;
    # With -n the output is discarded.  Only substitute /dev/null when an
    # input file was actually given; otherwise `-n' on its own would
    # fabricate a two-element @ARGV and slip past the usage check below.
    $ARGV[1] = '/dev/null' if @ARGV >= 1;
}

die "Usage: $0 <UnicodeData.txt> <java-chardecomp.h>" unless @ARGV == 2;

# Three-argument open: the file name is never interpreted as part of the
# mode, so names with leading/trailing whitespace or characters such as
# `>' and `|' are opened literally.
open (my $unicode_fh, '<', $ARGV[0])
    or die "Can't open Unicode attribute file: $!\n";

# Process the Unicode file.
$| = 1;
my $count = 0;
print STDERR "Parsing attributes file";
while (my $line = <$unicode_fh>)
{
    # One progress dot per 1000 input lines.
    print STDERR "." unless $count++ % 1000;
    chomp $line;
    $line =~ s/\r//g;

    # Field 0 is the code point, field 5 the decomposition mapping.
    my ($ch, undef, undef, undef, undef, $decomp) = split /;/, $line;

    # Blank or truncated lines have no usable fields; skip them instead
    # of feeding undef to hex() and `ne'.
    next unless defined $ch && $ch =~ /^[0-9A-Fa-f]+$/;
    next unless defined $decomp && $decomp ne '';

    $ch = hex ($ch);

    my $is_full = 0;
    my @units = ();
    foreach my $field (split (' ', $decomp))
    {
        # A <tag> marks a compatibility mapping; those are stored in the
        # "full" table rather than the canonical one.
        if ($field =~ /^<.*>$/)
        {
            $is_full = 1;
            next;
        }

        my $code_point = hex ($field);
        if ($code_point > 0xffff)
        {
            # pack "n" would silently wrap a value this large.  Store it
            # as a UTF-16 surrogate pair so that no bits are lost.
            $code_point -= 0x10000;
            push (@units,
                  0xd800 + ($code_point >> 10),
                  0xdc00 + ($code_point & 0x3ff));
        }
        else
        {
            push (@units, $code_point);
        }
    }

    my $packed = pack "n*", @units;
    if ($is_full)
    {
        $full_decomposition{$ch} = $packed;
    }
    else
    {
        $canonical_decomposition{$ch} = $packed;
    }
}
close ($unicode_fh);

# Now generate decomposition tables.
# DECOMP stays a bareword handle: the table-writing subs print to it by
# name.
open (DECOMP, '>', $ARGV[1]) or die "Can't open output file: $!\n";

print STDERR "\nGenerating tables\n";

print DECOMP <<EOF;
// java-chardecomp.h - Decomposition character tables -*- c++ -*-
#ifndef __JAVA_CHARDECOMP_H__
#define __JAVA_CHARDECOMP_H__
// These tables are automatically generated by the $0
// script. DO NOT EDIT the tables. Instead, fix the script
// and run it again.
// This file should only be included by natCollator.cc
struct decomp_entry
{
jchar key;
const char *value;
};
EOF

# The `&' form is kept because the sub (and its prototype) is only
# declared further down the file; the explicit () passes an empty
# argument list instead of sharing the caller's @_.
&write_decompositions ();

print DECOMP "#endif /* __JAVA_CHARDECOMP_H__ */\n";

# Buffered write errors (e.g. a full disk) only surface at close time.
close (DECOMP) or die "Can't close output file: $!\n";

print STDERR "Done\n";
exit;
# Write a single decomposition table to the DECOMP filehandle as a C
# array named `${name}_decomposition'.
#
# Arguments:
#   $name     - prefix used to form the generated array's name.
#   $is_canon - accepted from the caller but not consulted here.
#   %table    - code point => expansion, where the expansion is a string
#               of 16-bit big-endian units (what unpack "n*" reads).
#
# Only keys in the range 0 .. 0xffff are emitted, in ascending order.
sub write_single_decomposition($$%)
{
    my ($name, $is_canon, %table) = @_;

    my @entries = ();
    foreach my $code_point (grep { defined $table{$_} } 0 .. 0xffff)
    {
        # The expansion is written as a byte string, two hex escapes per
        # 16-bit unit with the high byte first.  A variable-length
        # string is used because most expansions are short but a few are
        # very long (e.g. \uFDFA), so a fixed-width record would waste a
        # lot of space.
        # NOTE(review): the previous comment described the string as
        # "terminated with a double nul"; nothing here writes an explicit
        # terminator -- confirm what the consumer expects.
        my $escaped = join ('',
                            map { sprintf ("\\x%02x\\x%02x", $_ >> 8, $_ & 0xff) }
                            unpack ("n*", $table{$code_point}));

        push (@entries,
              sprintf (" { 0x%04x, \"", $code_point) . $escaped . "\" }");
    }

    print DECOMP "static const decomp_entry ${name}_decomposition[] =\n{\n";
    print DECOMP join (",\n", @entries);
    print DECOMP "\n};\n\n";
}
# Write both decomposition tables: the canonical table first, then the
# full table.  Returns whatever the last table writer returned.
sub write_decompositions()
{
    # One row per table: name prefix, is-canonical flag, source hash.
    my @jobs = (
        [ 'canonical', 1, \%canonical_decomposition ],
        [ 'full',      0, \%full_decomposition ],
    );

    my $last_status;
    foreach my $job (@jobs)
    {
        my ($table_name, $canon_flag, $mapping) = @$job;
        $last_status =
            &write_single_decomposition ($table_name, $canon_flag, %$mapping);
    }
    return $last_status;
}
|