unicode-decomp.pl 3.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147
  1. #!/usr/bin/perl -w
  2. # unicode-decomp.pl - script to generate database for java.text.Collator
  3. # Copyright (C) 1998, 1999, 2002 Free Software Foundation, Inc.
  4. #
  5. # This file is part of libjava.
  6. #
  7. # This software is copyrighted work licensed under the terms of the
  8. # Libjava License. Please consult the file "LIBJAVA_LICENSE" for
  9. # details.
  10. # Code for reading UnicodeData.txt and generating the decomposition
  11. # tables in java-chardecomp.h. For now, the relevant Unicode definition files
  12. # are found in libjava/gnu/gcj/convert/.
  13. #
  14. # Usage: ./unicode-decomp.pl [-n] <UnicodeData.txt> <decomp.h>
  15. # where <UnicodeData.txt> is obtained from www.unicode.org (named
  16. # UnicodeData-3.0.0.txt for Unicode version 3.0.0), and <decomp.h>
  17. # is the final location of include/java-chardecomp.h.
  18. # As of JDK 1.4, use Unicode version 3.0.0 for best results.
  19. #
  20. # If this exits with nonzero status, then you must investigate the
  21. # cause of the problem.
  22. # Diagnostics and other information to stderr.
  23. # With -n, the files are not created, but all processing still occurs.
  24. # These maps characters to their decompositions.
  25. my %canonical_decomposition = ();
  26. my %full_decomposition = ();
  27. # Handle `-n' and open output files.
  28. if ($ARGV[0] && $ARGV[0] eq '-n')
  29. {
  30. shift @ARGV;
  31. $ARGV[1] = '/dev/null';
  32. }
  33. die "Usage: $0 <UnicodeData.txt> <java-chardecomp.h>" unless @ARGV == 2;
  34. open (UNICODE, "< $ARGV[0]") || die "Can't open Unicode attribute file: $!\n";
  35. # Process the Unicode file.
  36. $| = 1;
  37. my $count = 0;
  38. print STDERR "Parsing attributes file";
  39. while (<UNICODE>)
  40. {
  41. print STDERR "." unless $count++ % 1000;
  42. chomp;
  43. s/\r//g;
  44. my ($ch, undef, undef, undef, undef, $decomp) = split ';';
  45. $ch = hex($ch);
  46. if ($decomp ne '')
  47. {
  48. my $is_full = 0;
  49. my @decomp = ();
  50. foreach (split (' ', $decomp))
  51. {
  52. if (/^\<.*\>$/)
  53. {
  54. $is_full = 1;
  55. next;
  56. }
  57. push (@decomp, hex ($_));
  58. }
  59. my $s = pack "n*", @decomp;
  60. if ($is_full)
  61. {
  62. $full_decomposition{$ch} = $s;
  63. }
  64. else
  65. {
  66. $canonical_decomposition{$ch} = $s;
  67. }
  68. }
  69. }
  70. # Now generate decomposition tables.
  71. open DECOMP, "> $ARGV[1]" or die "Can't open output file: $!\n";
  72. print STDERR "\nGenerating tables\n";
  73. print DECOMP <<EOF;
  74. // java-chardecomp.h - Decomposition character tables -*- c++ -*-
  75. #ifndef __JAVA_CHARDECOMP_H__
  76. #define __JAVA_CHARDECOMP_H__
  77. // These tables are automatically generated by the $0
  78. // script. DO NOT EDIT the tables. Instead, fix the script
  79. // and run it again.
  80. // This file should only be included by natCollator.cc
  81. struct decomp_entry
  82. {
  83. jchar key;
  84. const char *value;
  85. };
  86. EOF
  87. &write_decompositions;
  88. print DECOMP "#endif /* __JAVA_CHARDECOMP_H__ */\n";
  89. close(DECOMP);
  90. print STDERR "Done\n";
  91. exit;
  92. # Write a single decomposition table.
  93. sub write_single_decomposition($$%)
  94. {
  95. my ($name, $is_canon, %table) = @_;
  96. my $first_line = 1;
  97. print DECOMP "static const decomp_entry ${name}_decomposition[] =\n{\n";
  98. for my $key (0 .. 0xffff)
  99. {
  100. next if ! defined $table{$key};
  101. print DECOMP ",\n" unless $first_line;
  102. $first_line = 0;
  103. printf DECOMP " { 0x%04x, \"", $key;
  104. # We represent the expansion as a series of bytes, terminated
  105. # with a double nul. This is ugly, but relatively
  106. # space-efficient. Most expansions are short, but there are a
  107. # few that are very long (e.g. \uFDFA). This means that if we
  108. # chose a fixed-space representation we would waste a lot of
  109. # space.
  110. my @expansion = unpack "n*", $table{$key};
  111. foreach my $char (@expansion)
  112. {
  113. printf DECOMP "\\x%02x\\x%02x", ($char / 256), ($char % 256);
  114. }
  115. print DECOMP "\" }";
  116. }
  117. print DECOMP "\n};\n\n";
  118. }
  119. sub write_decompositions()
  120. {
  121. &write_single_decomposition ('canonical', 1, %canonical_decomposition);
  122. &write_single_decomposition ('full', 0, %full_decomposition);
  123. }