#!/usr/bin/env perl

# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
# You can obtain one at http://mozilla.org/MPL/2.0/.

# This tool is used to extract "special" (one-to-many) case mappings
# into a form that can be used by nsTextRunTransformations.
  use strict;

  # Exactly two arguments are required: UnicodeData.txt and SpecialCasing.txt.
  # The generated C++ goes to standard output, so the usage text is sent to
  # STDERR and the exit status is non-zero; otherwise a bad invocation whose
  # output is redirected would leave the usage text in the .cpp file and
  # still report success.
  if (@ARGV != 2) {
      print STDERR <<"__EOT";
# Run this tool using a command line of the form
#
# perl genSpecialCasingData.pl UnicodeData.txt SpecialCasing.txt
#
# The nsSpecialCasingData.cpp file will be written to standard output.
#
# This tool will also write up-to-date versions of the test files
# all-{upper,lower,title}.html
# and corresponding -ref files in the current directory.
#
__EOT
      exit 1;
  }
  # Tables filled from UnicodeData.txt, all keyed by numeric code point.
  # The three case tables hold the mapping field verbatim (a hex string).
  my %allLower;
  my %allUpper;
  my %allTitle;
  # Decomposition string (space-separated hex code points) => the character
  # that decomposes to it, formatted as at least four upper-case hex digits.
  my %compositions;
  # Field 2 of each record (the general category).
  my %gc;

  # Three-argument open with a lexical handle: the file name can never be
  # misread as an open mode, and the OS error is included in the message.
  open my $unicodeData, '<', $ARGV[0]
      or die "can't open $ARGV[0] (should be UnicodeData.txt): $!\n";
  while (my $record = <$unicodeData>) {
      chomp $record;
      # A limit of -1 keeps trailing empty fields, so indexes 12..14 are
      # always defined for a well-formed record.
      my @fields = split /;/, $record, -1;
      next if @fields < 15;           # blank or truncated line
      next if $fields[1] =~ /</;      # ignore ranges etc
      my $usv = hex $fields[0];
      $allUpper{$usv} = $fields[12] if $fields[12] ne '';
      $allLower{$usv} = $fields[13] if $fields[13] ne '';
      $allTitle{$usv} = $fields[14] if $fields[14] ne '';
      $gc{$usv} = $fields[2];
      # we only care about non-singleton canonical decomps: skip empty
      # decompositions, tagged ones (containing '<') and single code points
      my $decomp = $fields[5];
      next if $decomp eq '' or $decomp =~ /</ or $decomp !~ / /;
      $compositions{$decomp} = sprintf("%04X", $usv);
  }
  close $unicodeData;
  # One-to-many mappings from SpecialCasing.txt, keyed by numeric code point;
  # values are space-separated hex code points (see addIfSpecial).
  my %specialLower;
  my %specialUpper;
  my %specialTitle;
  # Trailing '#' comment of each accepted data line, keyed by code point.
  my %charName;
  # "SpecialCasing-..." and "Date:..." comment lines, echoed into the output.
  my @headerLines;

  open my $specialCasing, '<', $ARGV[1]
      or die "can't open $ARGV[1] (should be SpecialCasing.txt): $!\n";
  while (my $line = <$specialCasing>) {
      chomp $line;
      # List-context capture: a line without a comment yields undef and can
      # never pick up a capture left over from an earlier successful match.
      my ($comment) = $line =~ m/#\s*(.+)$/;
      if (defined $comment and $comment =~ /^(?:SpecialCasing-|Date:)/) {
          push @headerLines, $comment;
          next;
      }
      $line =~ s/#.*//;       # drop the comment
      $line =~ s/;\s*$//;     # drop the terminating ';' and trailing blanks
      next if $line eq '';
      my @fields = split /; */, $line;
      # Only lines with exactly four fields (code, lower, title, upper) are
      # used; lines with any other field count are skipped.
      next unless @fields == 4;
      my $usv = hex $fields[0];
      addIfSpecial(\%specialLower, $usv, $fields[1]);
      addIfSpecial(\%specialTitle, $usv, $fields[2]);
      addIfSpecial(\%specialUpper, $usv, $fields[3]);
      $charName{$usv} = $comment;
  }
  close $specialCasing;
  # Write the generated C++ source to standard output: licence and include
  # preamble, the header lines copied from SpecialCasing.txt, the three
  # mapping tables, and finally the bsearch-based accessor functions.
  print <<"END_CPP";
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this file,
* You can obtain one at http://mozilla.org/MPL/2.0/. */
/* Auto-generated from files in the Unicode Character Database
by genSpecialCasingData.pl - do not edit! */
#include "nsSpecialCasingData.h"
#include "mozilla/ArrayUtils.h" // for ArrayLength
#include <stdlib.h> // for bsearch
END_CPP

  # A statement modifier rather than map: the list map builds is not wanted.
  print "/* $_ */\n" for @headerLines;

  print <<"END_CPP";
using mozilla::unicode::MultiCharMapping;
END_CPP

  printMappings('Lower', \%specialLower);
  printMappings('Upper', \%specialUpper);
  printMappings('Title', \%specialTitle);

  # Each doubled backslash below is emitted as a single backslash, i.e. a
  # C preprocessor line continuation inside the macro definition.
  print <<"END_CPP";
static int CompareMCM(const void* aKey, const void* aElement)
{
const uint32_t ch = *static_cast<const uint32_t*>(aKey);
const MultiCharMapping* mcm = static_cast<const MultiCharMapping*>(aElement);
return int(ch) - int(mcm->mOriginalChar);
}
#define MAKE_SPECIAL_CASE_ACCESSOR(which) \\
const MultiCharMapping* \\
Special##which(uint32_t aChar) \\
{ \\
const void* p = bsearch(&aChar, CaseSpecials_##which, \\
mozilla::ArrayLength(CaseSpecials_##which), \\
sizeof(MultiCharMapping), CompareMCM); \\
return static_cast<const MultiCharMapping*>(p); \\
}
namespace mozilla {
namespace unicode {
MAKE_SPECIAL_CASE_ACCESSOR(Lower)
MAKE_SPECIAL_CASE_ACCESSOR(Upper)
MAKE_SPECIAL_CASE_ACCESSOR(Title)
} // namespace unicode
} // namespace mozilla
END_CPP
  # Overlay the one-to-many mappings on the simple ones, so each %all* table
  # describes the complete expected result of its case conversion.
  foreach my $tables ([ \%allLower, \%specialLower ],
                      [ \%allUpper, \%specialUpper ],
                      [ \%allTitle, \%specialTitle ]) {
      addSpecialsTo(@{$tables});
  }

  # Font referenced by the @font-face rule of every generated test page.
  my $testFont = "../fonts/dejavu-sans/DejaVuSans.ttf";

  # Write the test/reference page pairs into the current directory.
  foreach my $job ([ 'lower', \%allLower ], [ 'upper', \%allUpper ]) {
      genTest(@{$job});
  }
  genTitleTest();
  # Print one static C++ array, CaseSpecials_<whichMapping>, to standard
  # output.
  #
  #   $whichMapping  suffix of the array name ('Lower', 'Upper' or 'Title')
  #   $hash          ref to a hash of code point => space-separated hex
  #                  code points
  #
  # Entries are emitted in ascending code point order (the generated
  # accessors look them up with bsearch). Each initialiser has exactly three
  # target slots; unused slots are written as 0x0000. The trailing comment is
  # the character's entry in %charName.
  sub printMappings {
      my ($whichMapping, $hash) = @_;
      print "static const MultiCharMapping CaseSpecials_${whichMapping}[] = {\n";
      foreach my $key (sort { $a <=> $b } keys %$hash) {
          my @chars = split(/ /, $hash->{$key});
          # The initialiser only has room for three code points; fail loudly
          # instead of silently dropping the rest of a longer mapping.
          die sprintf("mapping for U+%04X has %d code points, but only 3 fit\n",
                      $key, scalar @chars)
              if @chars > 3;
          my @slots = map { defined $chars[$_] ? hex($chars[$_]) : 0 } 0 .. 2;
          my $name = defined $charName{$key} ? $charName{$key} : '';
          printf " { 0x%04x, {0x%04x, 0x%04x, 0x%04x} }, // %s\n",
              $key, @slots, $name;
      }
      print "};\n\n";
  }
  # Record $mapping for code point $usv in %$hash, but only when the mapping
  # is "special", i.e. consists of more than one code point (contains a
  # space).
  #
  #   $hash     ref to the hash to update (code point => mapping string)
  #   $usv      numeric code point being mapped
  #   $mapping  space-separated hex code points
  #
  # Before storing, the leading code points are replaced by their composed
  # form whenever %compositions has an entry for them. Only compositions that
  # start with the initial char are done. The lookup is made on whole
  # code points (longest prefix first) and repeated until nothing more
  # composes, so the result does not depend on hash iteration order and a key
  # can never match part of a longer hex number.
  sub addIfSpecial {
      my ($hash, $usv, $mapping) = @_;
      return unless $mapping =~ / /;
      my $composed = 1;
      while ($composed) {
          $composed = 0;
          my @chars = split / /, $mapping;
          # Try the longest leading run of code points first, down to a pair.
          for (my $last = $#chars; $last >= 1; $last--) {
              my $prefix = join ' ', @chars[0 .. $last];
              next unless exists $compositions{$prefix};
              substr($mapping, 0, length $prefix) = $compositions{$prefix};
              $composed = 1;
              last;
          }
      }
      $hash->{$usv} = $mapping;
      return;
  }
  # Copy every entry of %$specials into %$hash, replacing any value already
  # stored under the same key.
  #
  #   $hash      ref to the hash that receives the entries
  #   $specials  ref to the hash whose entries are copied
  sub addSpecialsTo {
      my ($hash, $specials) = @_;
      # Hash-slice assignment; keys and values of an unmodified hash are
      # returned in the same order, so each value lands under its own key.
      @{$hash}{ keys %{$specials} } = values %{$specials};
      return;
  }
  # Write a test page and its reference page into the current directory.
  #
  #   $whichMapping  'lower' or 'upper'; used in the file names and in the
  #                  CSS text-transform value ("<whichMapping>case")
  #   $hash          ref to a hash of code point => space-separated hex
  #                  code points of the expected result
  #
  # all-<whichMapping>.html lists every source character under the
  # text-transform rule; all-<whichMapping>-ref.html lists the expected
  # result of each mapping with no transform applied. Both load $testFont and
  # annotate a character with its %charName entry when one exists.
  # Dies if either file cannot be opened or fully written.
  sub genTest {
      my ($whichMapping, $hash) = @_;
      my @keys = sort { $a <=> $b } keys %$hash;

      my $testFile = "all-$whichMapping.html";
      open my $test, '>', $testFile or die "can't write $testFile: $!\n";
      print {$test} <<"END_HTML";
<!DOCTYPE html>
<html>
<head>
<meta http-equiv="Content-type" content="text/html; charset=utf-8">
<style type="text/css">
\@font-face { font-family: foo; src: url($testFont); }
p { font-family: foo; text-transform: ${whichMapping}case; }
</style>
</head>
<body>
<p>
END_HTML
      foreach my $key (@keys) {
          printf {$test} "&#x%04X;", $key;
          print {$test} " <!-- $charName{$key} -->" if exists $charName{$key};
          print {$test} "\n";
      }
      print {$test} <<"END_HTML";
</p>
</body>
</html>
END_HTML
      # Buffered write errors only surface when the handle is closed.
      close $test or die "error writing $testFile: $!\n";

      my $refFile = "all-$whichMapping-ref.html";
      open my $ref, '>', $refFile or die "can't write $refFile: $!\n";
      print {$ref} <<"END_HTML";
<!DOCTYPE html>
<html>
<head>
<meta http-equiv="Content-type" content="text/html; charset=utf-8">
<style type="text/css">
\@font-face { font-family: foo; src: url($testFont); }
p { font-family: foo; }
</style>
</head>
<body>
<p>
END_HTML
      foreach my $key (@keys) {
          # One numeric character reference per code point of the mapping.
          print {$ref} join('', map { sprintf("&#x%s;", $_) } split(/ /, $hash->{$key}));
          print {$ref} " <!-- $charName{$key} -->" if exists $charName{$key};
          print {$ref} "\n";
      }
      print {$ref} <<"END_HTML";
</p>
</body>
</html>
END_HTML
      close $ref or die "error writing $refFile: $!\n";
      return;
  }
  # Write all-title.html and all-title-ref.html into the current directory.
  #
  # The test page lists every key of %allTitle followed by a lower-case "x",
  # styled with text-transform: capitalize. The reference page spells out the
  # expected rendering with no transform applied:
  #   - general category starting with L or N, and the character has an entry
  #     in %allUpper: its title mapping, followed by "x";
  #   - general category starting with L or N, no entry in %allUpper: the
  #     character itself, followed by "x";
  #   - any other category: the character itself, followed by "X".
  # Both pages load $testFont and annotate a character with its %charName
  # entry when one exists. Dies if either file cannot be opened or fully
  # written.
  sub genTitleTest {
      my @keys = sort { $a <=> $b } keys %allTitle;

      my $testFile = "all-title.html";
      open my $test, '>', $testFile or die "can't write $testFile: $!\n";
      print {$test} <<"END_HTML";
<!DOCTYPE html>
<html>
<head>
<meta http-equiv="Content-type" content="text/html; charset=utf-8">
<style type="text/css">
\@font-face { font-family: foo; src: url($testFont); }
p { font-family: foo; text-transform: capitalize; }
</style>
</head>
<body>
<p>
END_HTML
      foreach my $key (@keys) {
          printf {$test} "&#x%04X;x", $key;
          print {$test} " <!-- $charName{$key} -->" if exists $charName{$key};
          print {$test} "\n";
      }
      print {$test} <<"END_HTML";
</p>
</body>
</html>
END_HTML
      # Buffered write errors only surface when the handle is closed.
      close $test or die "error writing $testFile: $!\n";

      my $refFile = "all-title-ref.html";
      open my $ref, '>', $refFile or die "can't write $refFile: $!\n";
      print {$ref} <<"END_HTML";
<!DOCTYPE html>
<html>
<head>
<meta http-equiv="Content-type" content="text/html; charset=utf-8">
<style type="text/css">
\@font-face { font-family: foo; src: url($testFont); }
p { font-family: foo; }
</style>
</head>
<body>
<p>
END_HTML
      foreach my $key (@keys) {
          # capitalize is only applied to characters with GC=L* or N*...
          if ($gc{$key} =~ /^[LN]/) {
              # ...and those that are already uppercase are not transformed
              if (exists $allUpper{$key}) {
                  print {$ref} join('', map { sprintf("&#x%s;", $_) } split(/ /, $allTitle{$key}));
              }
              else {
                  printf {$ref} "&#x%04X;", $key;
              }
              print {$ref} "x";
          }
          else {
              # The character is left alone and the "x" after it appears
              # capitalized in the reference.
              printf {$ref} "&#x%04X;X", $key;
          }
          print {$ref} " <!-- $charName{$key} -->" if exists $charName{$key};
          print {$ref} "\n";
      }
      print {$ref} <<"END_HTML";
</p>
</body>
</html>
END_HTML
      close $ref or die "error writing $refFile: $!\n";
      return;
  }