updateOperatorDictionary.pl 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500
  1. #!/usr/bin/perl
  2. # -*- Mode: Perl; tab-width: 2; indent-tabs-mode: nil; -*-
  3. # This Source Code Form is subject to the terms of the Mozilla Public
  4. # License, v. 2.0. If a copy of the MPL was not distributed with this
  5. # file, You can obtain one at http://mozilla.org/MPL/2.0/.
  6. use XML::LibXSLT;
  7. use XML::LibXML;
  8. use LWP::Simple;
  # Files produced by the different modes of this script.
  $FILE_UNICODE        = "unicode.xml";
  $FILE_DICTIONARY     = "dictionary.xml";
  $FILE_DIFFERENCES    = "differences.txt";
  $FILE_NEW_DICTIONARY = "new_dictionary.txt";
  $FILE_SYNTAX_ERRORS  = "syntax_errors.txt";
  $FILE_JS             = "tests/stretchy-and-large-operators.js";

  # Our own operator dictionary (a property file).
  $MOZ_DICTIONARY = "mathfont.properties";

  # Dictionary published by the W3C in "XML Entity Definitions for Characters".
  $WG_DICTIONARY_URL = "http://www.w3.org/2003/entities/2007xml/unicode.xml";

  # XSL stylesheet that extracts the relevant data from the W3C dictionary.
  $DICTIONARY_XSL = "operatorDictionary.xsl";

  # W3C dictionary once transformed with operatorDictionary.xsl.
  $WG_DICTIONARY = $FILE_DICTIONARY;

  # Command-line validation: every accepted mode is mapped to the highest
  # index @ARGV may reach for it (1 means one optional extra argument,
  # 0 means the mode takes no argument at all).
  my %last_arg_index_for = (
      "download" => 1,
      "compare"  => 1,
      "check"    => 0,
      "make-js"  => 0,
      "clean"    => 0,
  );

  my $command_line_is_valid = 0;
  if (@ARGV && exists($last_arg_index_for{$ARGV[0]})) {
      $command_line_is_valid = ($#ARGV <= $last_arg_index_for{$ARGV[0]});
  }
  if (!$command_line_is_valid) {
      # Unknown mode or too many arguments: show the synopsis and quit.
      usage();
  }
  # Mode "download": fetch the W3C dictionary (from the default URL, or from
  # the URL given as second argument), store it in $FILE_UNICODE, then run it
  # through $DICTIONARY_XSL and write the result to $FILE_DICTIONARY.
  if ($ARGV[0] eq "download") {
      if ($#ARGV == 1) {
          $WG_DICTIONARY_URL = $ARGV[1];
      }

      print "Downloading $WG_DICTIONARY_URL...\n";
      # getstore() reports failures through the HTTP status code it returns
      # rather than by dying, so check it before parsing the stored file.
      my $status = getstore($WG_DICTIONARY_URL, $FILE_UNICODE);
      if (!is_success($status)) {
          die("Couldn't download $WG_DICTIONARY_URL (HTTP status $status)!\n");
      }

      print "Converting $FILE_UNICODE into $FILE_DICTIONARY...\n";
      my $xslt = XML::LibXSLT->new();
      my $source = XML::LibXML->load_xml(location => $FILE_UNICODE);
      my $style_doc = XML::LibXML->load_xml(location => $DICTIONARY_XSL,
                                            no_cdata => 1);
      my $stylesheet = $xslt->parse_stylesheet($style_doc);
      my $results = $stylesheet->transform($source);

      # Three-argument open with a lexical handle; the close is checked
      # because buffered write errors only surface at that point.
      open(my $dictionary_file, '>', $FILE_DICTIONARY)
          or die("Couldn't open $FILE_DICTIONARY: $!");
      print {$dictionary_file} $stylesheet->output_as_bytes($results);
      close($dictionary_file)
          or die("Couldn't write $FILE_DICTIONARY: $!");
      exit 0;
  }
  # Mode "clean": delete the intermediate and report files listed below,
  # then quit. $FILE_JS is not part of the list and is left in place.
  if ($ARGV[0] eq "clean") {
      my @generated_files = (
          $FILE_UNICODE,
          $FILE_DICTIONARY,
          $FILE_DIFFERENCES,
          $FILE_NEW_DICTIONARY,
          $FILE_SYNTAX_ERRORS,
      );
      unlink(@generated_files);
      exit 0;
  }

  # Mode "compare": an optional second argument replaces the default
  # transformed dictionary file.
  $WG_DICTIONARY = $ARGV[1] if ($ARGV[0] eq "compare" && $#ARGV == 1);
  61. ################################################################################
  62. # structure of the dictionary used by this script:
  63. # - key: same as in mathfont.properties
  64. # - table:
  65. # index | value
  66. # 0 | description
  67. # 1 | lspace
  68. # 2 | rspace
  69. # 3 | minsize
  70. # 4 | largeop
  71. # 5 | movablelimits
  72. # 6 | stretchy
  73. # 7 | separator
  74. # 8 | accent
  75. # 9 | fence
  76. # 10 | symmetric
  77. # 11 | priority
  78. # 12 | linebreakstyle
  79. # 13 | direction
  80. # 14 | integral
  81. # 15 | mirrorable
  # 1) build %moz_hash from $MOZ_DICTIONARY
  #
  # Every line of the form
  #     operator.<name>.<form> = <properties> # <description>
  # becomes one entry of %moz_hash: the key is the text left of " = " and the
  # value is a reference to a 16-element array laid out as described in the
  # table above (index 0 is the description, index 15 is "mirrorable").
  print "loading $MOZ_DICTIONARY...\n";
  open(my $moz_file, '<', $MOZ_DICTIONARY)
      or die("Couldn't open $MOZ_DICTIONARY: $!");

  print "building dictionary...\n";

  # Boolean properties and the array index each of them is stored at.
  my %moz_flag_index = (
      "largeop"       => 4,
      "movablelimits" => 5,
      "stretchy"      => 6,
      "separator"     => 7,
      "accent"        => 8,
      "fence"         => 9,
      "symmetric"     => 10,
      "integral"      => 14,
      "mirrorable"    => 15,
  );

  while (my $line = <$moz_file>) {
      next unless ($line =~ m/^operator\.(.*)$/);

      # A failed match leaves the previous capture variables in place, so an
      # entry that does not have the expected shape must be skipped instead
      # of being stored under a key assembled from stale captures.
      unless ($line =~ m/^([\w|\.|\\]*)\s=\s(.*)\s#\s(.*)$/) {
          warn("skipping malformed entry in $MOZ_DICTIONARY: $line");
          next;
      }

      # 1.1) build the key
      my ($moz_key, $properties, $description) = ($1, $2, $3);

      # 1.2) build the array
      my @fields = ();
      $fields[0] = $description;
      $fields[1] = ($properties =~ m/^(.*)lspace:(\d)(.*)$/)  ? $2 : "5";
      $fields[2] = ($properties =~ m/^(.*)rspace:(\d)(.*)$/)  ? $2 : "5";
      $fields[3] = ($properties =~ m/^(.*)minsize:(\d)(.*)$/) ? $2 : "1";
      foreach my $flag (keys %moz_flag_index) {
          $fields[ $moz_flag_index{$flag} ] =
              ($properties =~ m/^(.*)${flag}(.*)$/) ? 1 : "";
      }
      $fields[11] = ""; # we don't store "priority" in our dictionary
      $fields[12] = ""; # we don't store "linebreakstyle" in our dictionary
      $fields[13] =
          ($properties =~ m/^(.*)direction:([a-z]*)(.*)$/) ? $2 : "";

      # 1.3) save the key and value
      $moz_hash{$moz_key} = [ @fields ];
  }
  close($moz_file);
  115. ################################################################################
  # 2) If mode "make-js", generate tests/stretchy-and-large-operators.js and quit.
  # If mode "check", verify validity of our operator dictionary and quit.
  # If mode "compare", go to step 3)
  if ($ARGV[0] eq "make-js") {
      print "generating file $FILE_JS...\n";
      open(my $js_file, '>', $FILE_JS)
          or die("Couldn't open $FILE_JS: $!");
      print {$js_file} "// This file is automatically generated. Do not edit.\n";
      print {$js_file} "var stretchy_and_large_operators = [";

      # Keys are visited in sorted order so that the generated file does not
      # change from one run to the next because of hash ordering.
      foreach my $moz_key (sort(keys %moz_hash)) {
          # Keys that are not of the form operator.<name>.<form> are skipped:
          # after a failed match $1/$2 would still hold the captures of an
          # earlier key.
          next unless
              ($moz_key =~ m/^operator\.([\w|\.|\\]*)\.(prefix|infix|postfix)$/);
          my ($name, $form) = ($1, $2);
          my @moz = @{ $moz_hash{$moz_key} };
          my $opname = "\\$name.$form: ";

          # One JS row per large operator, tagged 'l' ...
          if ($moz[4]) {
              print {$js_file} "['$opname', '$name','l','$form'],";
          }
          # ... and one per stretchy operator, tagged with the first letter
          # of its direction (empty when no direction is recorded).
          if ($moz[6]) {
              my $direction_letter = substr($moz[13], 0, 1);
              print {$js_file} "['$opname', '$name','$direction_letter','$form'],";
          }
      }

      print {$js_file} "];\n";
      close($js_file)
          or die("Couldn't write $FILE_JS: $!");
      exit 0;
  }
  # Mode "check": validate the private data of our operator dictionary, write
  # every problem found to $FILE_SYNTAX_ERRORS, print a summary and quit.
  if ($ARGV[0] eq "check") {
      print "checking operator dictionary...\n";
      open(my $report, '>', $FILE_SYNTAX_ERRORS)
          or die("Couldn't open $FILE_SYNTAX_ERRORS: $!");

      my $nb_errors = 0;
      my $nb_warnings = 0;

      # check the validity of our private data
      foreach my $moz_key (sort(keys %moz_hash)) {
          my @moz = @{ $moz_hash{$moz_key} };
          my $valid = 1;

          if (!($moz[13] eq "" ||
                $moz[13] eq "horizontal" ||
                $moz[13] eq "vertical")) {
              $valid = 0;
              $nb_errors++;
              print {$report} "error: invalid direction \"$moz[13]\"\n";
          }

          if (!$moz[4] && $moz[14]) {
              $valid = 0;
              $nb_warnings++;
              print {$report} "warning: operator is integral but not largeop\n";
          }

          if (($moz[0] =~ m/^(.*)[iI]ntegral(.*)$/) && !$moz[14]) {
              $valid = 0;
              $nb_warnings++;
              print {$report} "warning: operator contains the term \"integral\" in its comment, but is not integral\n";
          }

          if (!$valid) {
              # Dump the offending entry right after its diagnostics.
              print {$report} generateEntry($moz_key, @moz);
              print {$report} "\n";
          }
      }

      # check that all forms have the same direction.
      #
      # Entries are first grouped by operator (the key without its form
      # suffix), so each operator is examined exactly once and %moz_hash
      # does not have to be edited while it is being walked.
      my %forms_of;
      foreach my $moz_key (keys %moz_hash) {
          next unless
              ($moz_key =~ m/^([\w|\.|\\]*)\.(prefix|infix|postfix)$/);
          $forms_of{$1}{$2} = $moz_hash{$moz_key};
      }

      foreach my $operator (sort(keys %forms_of)) {
          my @forms = grep { exists($forms_of{$operator}{$_}) }
                           ("prefix", "infix", "postfix");

          # Collect the distinct directions used by the forms present.
          my %directions;
          foreach my $form (@forms) {
              $directions{ $forms_of{$operator}{$form}[13] } = 1;
          }
          next if (scalar(keys %directions) <= 1);

          $nb_errors++;
          print {$report}
              "error: operator has a stretchy form, but all forms";
          print {$report}
              " have not the same direction\n";
          foreach my $form (@forms) {
              print {$report}
                  generateEntry("$operator.$form",
                                @{ $forms_of{$operator}{$form} });
          }
          print {$report} "\n";
      }

      close($report)
          or die("Couldn't write $FILE_SYNTAX_ERRORS: $!");

      print "\n";
      if ($nb_errors > 0 || $nb_warnings > 0) {
          print "$nb_errors error(s) found\n";
          print "$nb_warnings warning(s) found\n";
          print "See output file $FILE_SYNTAX_ERRORS.\n\n";
      } else {
          print "No error found.\n\n";
      }
      exit 0;
  }
  250. ################################################################################
  # 3) build %wg_hash and @wg_keys from the page $WG_DICTIONARY
  #
  # Each /root/entry element of the transformed dictionary yields one key of
  # the form operator.\uNNNN[\uNNNN...].<form> and one 16-element array using
  # the same layout as %moz_hash.
  print "loading $WG_DICTIONARY...\n";
  my $doc = XML::LibXML->new()->parse_file($WG_DICTIONARY);

  print "building dictionary...\n";
  @wg_keys = ();

  # Boolean properties read from the "properties" attribute, with the array
  # index each one is stored at.
  my @wg_flag_slots = (
      [ 4,  "largeop" ],
      [ 5,  "movablelimits" ],
      [ 6,  "stretchy" ],
      [ 7,  "separator" ],
      [ 8,  "accent" ],
      [ 9,  "fence" ],
      [ 10, "symmetric" ],
      [ 15, "mirrorable" ],
  );

  # Attributes that fall back to a default value when they are empty.
  my @wg_defaulted_slots = (
      [ 1, "lspace",  "5" ],
      [ 2, "rspace",  "5" ],
      [ 3, "minsize", "1" ],
  );

  foreach my $node ($doc->findnodes('/root/entry')) {
      # 3.1) build the key
      my $wg_key = "operator.";
      my $code_points = $node->getAttribute("unicode") . "-";
      while ($code_points =~ m/^U?0(\w*)-(.*)$/) {
          # Concatenate .\uNNNN
          $wg_key .= "\\u$1";
          $code_points = $2;
      }
      $wg_key .= "." . $node->getAttribute("form"); # "Form"

      # 3.2) build the array
      my @fields = ();
      $fields[0] = lc($node->getAttribute("description"));

      foreach my $slot (@wg_defaulted_slots) {
          my ($index, $attribute, $default) = @{$slot};
          my $raw = $node->getAttribute($attribute);
          $fields[$index] = ($raw eq "") ? $default : $raw;
      }

      my $properties = $node->getAttribute("properties");
      foreach my $slot (@wg_flag_slots) {
          my ($index, $flag) = @{$slot};
          $fields[$index] = ($properties =~ m/^(.*)${flag}(.*)$/);
      }

      $fields[11] = $node->getAttribute("priority");
      $fields[12] = $node->getAttribute("linebreakstyle");

      # not stored in the WG dictionary
      $fields[13] = ""; # direction
      $fields[14] = ""; # integral

      # 3.3) save the key and value
      push(@wg_keys, $wg_key);
      $wg_hash{$wg_key} = [ @fields ];
  }

  # Step 4 consumes @wg_keys with pop(); reversing here makes it visit the
  # entries in document order.
  @wg_keys = reverse(@wg_keys);
  297. ################################################################################
  # 4) Compare the two dictionaries and output the result
  #
  # Two files are written: $FILE_DIFFERENCES lists conflicting, new and
  # obsolete entries, and $FILE_NEW_DICTIONARY holds a merged dictionary in
  # which the WG values win and our private data is kept.
  print "comparing dictionaries...\n";
  open(my $differences, '>', $FILE_DIFFERENCES)
      or die("Couldn't open $FILE_DIFFERENCES: $!");
  open(my $new_dictionary, '>', $FILE_NEW_DICTIONARY)
      or die("Couldn't open $FILE_NEW_DICTIONARY: $!");

  my ($conflicting, $conflicting_stretching) = (0, 0);
  my ($new, $new_stretching) = (0, 0);
  my ($obsolete, $obsolete_stretching) = (0, 0);
  my $unchanged = 0;

  # 4.1) look to the entries of the WG dictionary
  while (my $wg_key = pop(@wg_keys)) {
      my @wg = @{ $wg_hash{$wg_key} };
      delete $wg_hash{$wg_key};
      my $wg_value = generateCommon(@wg);

      if (exists($moz_hash{$wg_key})) {
          # entry is in both dictionary
          my @moz = @{ $moz_hash{$wg_key} };
          delete $moz_hash{$wg_key};
          my $moz_value = generateCommon(@moz);

          if ($moz_value ne $wg_value) {
              # conflicting entry
              print {$differences} "[conflict]";
              $conflicting++;
              # The stretchy flags are booleans (1 or an empty string), so
              # they are compared logically rather than numerically.
              if ($moz[6] xor $wg[6]) {
                  print {$differences} "[stretching]";
                  $conflicting_stretching++;
              }
              print {$differences} " - $wg_key ($wg[0])\n";
              print {$differences} "-$moz_value\n+$wg_value\n\n";
          } else {
              # unchanged entry
              $unchanged++;
          }
          # In both cases the merged entry uses the WG values and keeps our
          # private data.
          print {$new_dictionary}
              completeCommon($wg_value, $wg_key, @moz, @wg);
      } else {
          # we don't have this entry in our dictionary yet
          print {$differences} "[new entry]";
          $new++;
          if ($wg[6]) {
              print {$differences} "[stretching]";
              $new_stretching++;
          }
          print {$differences} " - $wg_key ($wg[0])\n";
          print {$differences} "-\n+$wg_value\n\n";
          print {$new_dictionary} completeCommon($wg_value, $wg_key, @wg);
      }
  }

  print {$new_dictionary}
      "\n# Entries below are not part of the official MathML dictionary\n\n";

  # 4.2) look in our dictionary the remaining entries
  # (everything matched in 4.1 has been deleted from %moz_hash by now)
  foreach my $moz_key (sort(keys %moz_hash)) {
      my @moz = @{ $moz_hash{$moz_key} };
      my $moz_value = generateCommon(@moz);
      print {$differences} "[obsolete entry]";
      $obsolete++;
      if ($moz[6]) {
          print {$differences} "[stretching]";
          $obsolete_stretching++;
      }
      print {$differences} " - $moz_key ($moz[0])\n";
      print {$differences} "-$moz_value\n+\n\n";
      print {$new_dictionary} completeCommon($moz_value, $moz_key, @moz);
  }

  # Buffered write errors only show up when the handles are closed.
  close($differences)
      or die("Couldn't write $FILE_DIFFERENCES: $!");
  close($new_dictionary)
      or die("Couldn't write $FILE_NEW_DICTIONARY: $!");

  print "\n";
  print "- $obsolete obsolete entries ";
  print "($obsolete_stretching of them are related to stretching)\n";
  print "- $unchanged unchanged entries\n";
  print "- $conflicting conflicting entries ";
  print "($conflicting_stretching of them are related to stretching)\n";
  print "- $new new entries ";
  print "($new_stretching of them are related to stretching)\n";
  print "\nSee output files $FILE_DIFFERENCES and $FILE_NEW_DICTIONARY.\n\n";
  # The command is separated from "please run" by a space and spelled with
  # its .pl suffix, as in the usage text.
  print "After having modified the dictionary, please run";
  print " ./updateOperatorDictionary.pl check\n\n";
  exit 0;
  383. ################################################################################
  sub usage {
      # Print the synopsis of every supported mode, then terminate the
      # script with exit status 0.
      my @synopsis_lines = (
          "usage:\n",
          " ./updateOperatorDictionary.pl download [unicode.xml]\n",
          " ./updateOperatorDictionary.pl compare [dictionary.xml]\n",
          " ./updateOperatorDictionary.pl check\n",
          " ./updateOperatorDictionary.pl make-js\n",
          " ./updateOperatorDictionary.pl clean\n",
      );
      foreach my $synopsis_line (@synopsis_lines) {
          print $synopsis_line;
      }
      exit 0;
  }
  sub generateCommon {
      # Helper function to generate the string of data shared by both
      # dictionaries.
      #
      # Arguments: the flattened 16-element record of one operator (index 0
      #            description, 1 lspace, 2 rspace, 3 minsize, 4 largeop,
      #            5 movablelimits, 6 stretchy, 7 separator, 8 accent,
      #            9 fence, 10 symmetric, 15 mirrorable).
      # Returns:   "lspace:L rspace:R", followed by " minsize:M" when minsize
      #            is not "1", followed by the name of every flag that is set,
      #            each separated by a single space.
      #
      # The result is built in lexical variables so that calling this helper
      # does not overwrite a package variable named $entry in the caller.
      my (@v) = @_;

      my @parts = ("lspace:$v[1]", "rspace:$v[2]");
      if ($v[3] ne "1") {
          push(@parts, "minsize:$v[3]");
      }

      # Flags in the order they are written, with the index they are read from.
      my @flag_slots = (
          [ 4,  "largeop" ],
          [ 5,  "movablelimits" ],
          [ 6,  "stretchy" ],
          [ 7,  "separator" ],
          [ 8,  "accent" ],
          [ 9,  "fence" ],
          [ 10, "symmetric" ],
          [ 15, "mirrorable" ],
      );
      foreach my $slot (@flag_slots) {
          my ($index, $flag) = @{$slot};
          push(@parts, $flag) if ($v[$index]);
      }

      return join(" ", @parts);
  }
  sub completeCommon {
      # Helper to add key and private data to generateCommon.
      #
      # Arguments: $entry - string produced by generateCommon
      #            $key   - dictionary key written left of " = "
      #            then one or two flattened 16-element operator records:
      #            our own record first, optionally followed by the WG one.
      # Returns:   the complete dictionary line, terminated by "\n".
      #
      # Callers pass the records as flattened lists, so they cannot be
      # received as two separate arrays in the parameter list: the first
      # array would swallow everything. The first 16 values are therefore
      # taken as our record and whatever follows as the WG record. When a
      # single record is given it is used as "our" record, and the WG
      # record is empty.
      my ($entry, $key, @fields) = @_;
      my $record_length = 16;
      my @v_moz = splice(@fields, 0, $record_length);
      my @v_wg = @fields;

      $entry = "$key = $entry";

      if ($v_moz[13]) { $entry = "$entry direction:$v_moz[13]"; }
      if ($v_moz[14]) { $entry = "$entry integral"; }
      # generateCommon may already have written "mirrorable"; do not emit
      # the flag a second time.
      if ($v_moz[15] && $entry !~ m/\smirrorable(?:\s|$)/) {
          $entry = "$entry mirrorable";
      }

      # keep our previous comment, otherwise use the description given by
      # the WG
      my $comment = $v_moz[0];
      if (!defined($comment) || $comment eq "") {
          $comment = $v_wg[0];
      }
      $comment = "" unless defined($comment);

      return "$entry # $comment\n";
  }
  sub generateEntry {
      # Helper function to generate an entry of our operator dictionary.
      #
      # Arguments: $key - dictionary key of the operator
      #            @moz - the flattened record of that operator
      # Returns:   the dictionary line for the operator, as assembled by
      #            generateCommon and completeCommon.
      #
      # The intermediate string lives in a lexical so that the caller's
      # package variable $entry is left alone.
      my ($key, @moz) = @_;
      my $common = generateCommon(@moz);
      # The record is passed twice: once as our data, once as the source of
      # the fallback description.
      return completeCommon($common, $key, @moz, @moz);
  }
  432. }