NodeNameNormalization.pm 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354
  1. # NodeNameNormalization.pm: output tree as normalized node name.
  2. #
  3. # Copyright 2010, 2011, 2012, 2016 Free Software Foundation, Inc.
  4. #
  5. # This program is free software; you can redistribute it and/or modify
  6. # it under the terms of the GNU General Public License as published by
  7. # the Free Software Foundation; either version 3 of the License,
  8. # or (at your option) any later version.
  9. #
  10. # This program is distributed in the hope that it will be useful,
  11. # but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13. # GNU General Public License for more details.
  14. #
  15. # You should have received a copy of the GNU General Public License
  16. # along with this program. If not, see <http://www.gnu.org/licenses/>.
  17. #
  18. # Original author: Patrice Dumas <pertusus@free.fr>
  19. # the rules for conversion are decribed in the Texinfo manual, for
  20. # HTML crossrefs.
  21. package Texinfo::Convert::NodeNameNormalization;
  22. use 5.00405;
  23. use strict;
  24. use Unicode::Normalize;
  25. use Text::Unidecode;
  26. # for the accents definition
  27. use Texinfo::Common;
  28. # reuse some conversion hashes
  29. use Texinfo::Convert::Text;
  30. # use the hashes and functions
  31. use Texinfo::Convert::Unicode;
  32. require Exporter;
  33. use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
  34. @ISA = qw(Exporter);
  35. # Items to export into callers namespace by default. Note: do not export
  36. # names by default without a very good reason. Use EXPORT_OK instead.
  37. # Do not simply export all your public functions/methods/constants.
  38. # This allows declaration use Texinfo::Convert::NodeNameNormalization ':all';
  39. # If you do not need this, moving things directly into @EXPORT or @EXPORT_OK
  40. # will save memory.
  41. %EXPORT_TAGS = ( 'all' => [ qw(
  42. normalize_node
  43. transliterate_texinfo
  44. ) ] );
  45. @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } );
  46. @EXPORT = qw(
  47. );
  48. my %normalize_node_brace_no_arg_commands
  49. = %Texinfo::Convert::Text::text_brace_no_arg_commands;
  50. foreach my $command (keys(%Texinfo::Convert::Unicode::unicode_character_brace_no_arg_commands)) {
  51. $normalize_node_brace_no_arg_commands{$command} =
  52. $Texinfo::Convert::Unicode::unicode_character_brace_no_arg_commands{$command};
  53. }
  54. my %normalize_node_no_brace_commands
  55. = %Texinfo::Common::no_brace_commands;
  56. $normalize_node_no_brace_commands{'*'} = ' ';
  57. my %accent_commands = %Texinfo::Common::accent_commands;
  58. my %ignored_brace_commands;
  59. foreach my $ignored_brace_command ('anchor', 'footnote', 'shortcaption',
  60. 'caption', 'hyphenation') {
  61. $ignored_brace_commands{$ignored_brace_command} = 1;
  62. }
  63. my %ignored_types;
  64. foreach my $type ('empty_line_after_command', 'preamble',
  65. 'empty_spaces_after_command', 'spaces_at_end',
  66. 'empty_spaces_before_argument', 'empty_spaces_before_paragraph',
  67. 'space_at_end_menu_node',
  68. 'empty_spaces_after_close_brace') {
  69. $ignored_types{$type} = 1;
  70. }
  71. sub normalize_node($)
  72. {
  73. my $root = shift;
  74. my $result = convert($root);
  75. $result = Unicode::Normalize::NFC($result);
  76. $result = _unicode_to_protected($result);
  77. $result = 'Top' if ($result =~ /^Top$/i);
  78. return $result;
  79. }
  80. sub transliterate_texinfo($;$)
  81. {
  82. my $root = shift;
  83. my $no_unidecode = shift;
  84. my $result = convert($root);
  85. $result = Unicode::Normalize::NFC($result);
  86. $result = _unicode_to_protected(
  87. _unicode_to_transliterate($result, $no_unidecode));
  88. return $result;
  89. }
  90. sub convert($)
  91. {
  92. my $root = shift;
  93. my $result = _convert($root);
  94. }
  95. sub _unicode_to_protected($)
  96. {
  97. my $text = shift;
  98. my $result = '';
  99. while ($text ne '') {
  100. if ($text =~ s/^([A-Za-z0-9]+)//o) {
  101. $result .= $1;
  102. } elsif ($text =~ s/^ //o) {
  103. $result .= '-';
  104. } elsif ($text =~ s/^(.)//o) {
  105. my $char = $1;
  106. if (exists($Texinfo::Convert::Unicode::unicode_simple_character_map{$char})) {
  107. $result .= '_' . lc($Texinfo::Convert::Unicode::unicode_simple_character_map{$char});
  108. } else {
  109. if (ord($char) <= hex(0xFFFF)) {
  110. $result .= '_' . lc(sprintf("%04x",ord($char)));
  111. } else {
  112. $result .= '__' . lc(sprintf("%06x",ord($char)));
  113. }
  114. }
  115. } else {
  116. warn "Bug: unknown character _unicode_to_protected (likely in infinite loop)\n";
  117. print STDERR "Text: !!$text!!\n";
  118. sleep 1;
  119. }
  120. }
  121. return $result;
  122. }
  123. sub _unicode_to_transliterate($;$)
  124. {
  125. my $text = shift;
  126. my $no_unidecode = shift;
  127. if (chomp($text)) {
  128. warn "Bug: end of line to transliterate: $text\n";
  129. }
  130. my $result = '';
  131. while ($text ne '') {
  132. if ($text =~ s/^([A-Za-z0-9 ]+)//o) {
  133. $result .= $1;
  134. } elsif ($text =~ s/^(.)//o) {
  135. my $char = $1;
  136. if (exists($Texinfo::Convert::Unicode::unicode_simple_character_map{$char})) {
  137. $result .= $char;
  138. } elsif (ord($char) <= hex(0xFFFF)
  139. and exists($Texinfo::Convert::Unicode::transliterate_map{uc(sprintf("%04x",ord($char)))})) {
  140. $result .= $Texinfo::Convert::Unicode::transliterate_map{uc(sprintf("%04x",ord($char)))};
  141. } elsif (ord($char) <= hex(0xFFFF)
  142. and exists($Texinfo::Convert::Unicode::diacritics_accent_commands{uc(sprintf("%04x",ord($char)))})) {
  143. $result .= '';
  144. # in this case, we want to avoid calling unidecode, as we are sure
  145. # that there is no useful transliteration of the unicode character
  146. # instead we want to keep it as is.
  147. # This is the case, for example, for @exclamdown, is corresponds
  148. # with x00a1, but unidecode transliterates it to a !, we want
  149. # to avoid that and keep x00a1.
  150. } elsif (ord($char) <= hex(0xFFFF)
  151. and exists($Texinfo::Convert::Unicode::no_transliterate_map{uc(sprintf("%04x",ord($char)))})) {
  152. $result .= $char;
  153. } else {
  154. if ($no_unidecode) {
  155. if (ord($char) <= hex(0xFFFF)
  156. and exists ($Texinfo::Convert::Unicode::transliterate_accent_map{uc(sprintf("%04x",ord($char)))})) {
  157. $result .= $Texinfo::Convert::Unicode::transliterate_accent_map{uc(sprintf("%04x",ord($char)))};
  158. } else {
  159. $result .= $char;
  160. }
  161. } else {
  162. $result .= unidecode($char);
  163. }
  164. }
  165. #print STDERR " ($no_unidecode) $text -> CHAR: ".ord($char)." ".uc(sprintf("%04x",ord($char)))."\n$result\n";
  166. } else {
  167. warn "Bug: unknown character _unicode_to_transliterate (likely in infinite loop)\n";
  168. print STDERR "Text: !!$text!!\n";
  169. sleep 1;
  170. }
  171. }
  172. return $result;
  173. }
  174. sub _convert($;$);
  175. sub _convert($;$)
  176. {
  177. my $root = shift;
  178. my $in_sc = shift;
  179. return '' if (($root->{'type'} and $ignored_types{$root->{'type'}})
  180. or ($root->{'cmdname'}
  181. and ($ignored_brace_commands{$root->{'cmdname'}}
  182. # here ignore the misc commands
  183. or ($root->{'args'} and $root->{'args'}->[0]
  184. and $root->{'args'}->[0]->{'type'}
  185. and ($root->{'args'}->[0]->{'type'} eq 'misc_line_arg'
  186. or $root->{'args'}->[0]->{'type'} eq 'misc_arg')))));
  187. my $result = '';
  188. if (defined($root->{'text'})) {
  189. $result = $root->{'text'};
  190. $result =~ s/\s+/ /go;
  191. $result = uc($result) if ($in_sc);
  192. }
  193. if ($root->{'cmdname'}) {
  194. my $command = $root->{'cmdname'};
  195. if (defined($normalize_node_no_brace_commands{$root->{'cmdname'}})) {
  196. return $normalize_node_no_brace_commands{$root->{'cmdname'}};
  197. } elsif (defined($normalize_node_brace_no_arg_commands{$root->{'cmdname'}})) {
  198. $command = $root->{'extra'}->{'clickstyle'}
  199. if ($root->{'extra'}
  200. and defined($root->{'extra'}->{'clickstyle'})
  201. and defined($normalize_node_brace_no_arg_commands{$root->{'extra'}->{'clickstyle'}}));
  202. my $result = $normalize_node_brace_no_arg_commands{$command};
  203. if ($in_sc and $Texinfo::Common::letter_no_arg_commands{$command}) {
  204. $result = uc($result);
  205. }
  206. return $result;
  207. # commands with braces
  208. } elsif ($accent_commands{$root->{'cmdname'}}) {
  209. return '' if (!$root->{'args'});
  210. my $accent_text = _convert($root->{'args'}->[0]);
  211. my $accented_char
  212. = Texinfo::Convert::Unicode::unicode_accent($accent_text,
  213. $root);
  214. if (!defined($accented_char)) {
  215. # In this case, the node normalization do not follow the specification,
  216. # but we cannot do better
  217. $accented_char = Texinfo::Convert::Text::ascii_accent($accent_text,
  218. $root);
  219. }
  220. if ($in_sc) {
  221. return uc ($accented_char);
  222. } else {
  223. return $accented_char;
  224. }
  225. #} elsif ($root->{'cmdname'} eq 'image') {
  226. # return _convert($root->{'args'}->[0]);
  227. } elsif ($Texinfo::Common::ref_commands{$root->{'cmdname'}}) {
  228. my @args_try_order;
  229. if ($root->{'cmdname'} eq 'inforef') {
  230. @args_try_order = (0, 1, 2);
  231. } else {
  232. @args_try_order = (0, 1, 2, 4, 3);
  233. }
  234. foreach my $index (@args_try_order) {
  235. if (defined($root->{'args'}->[$index])) {
  236. my $text = _convert($root->{'args'}->[$index]);
  237. return $text if (defined($text) and $text =~ /\S/);
  238. }
  239. }
  240. return '';
  241. # Here all the commands with args are processed, if they have
  242. # more than one arg the first one is used.
  243. } elsif ($root->{'args'} and $root->{'args'}->[0]
  244. and (($root->{'args'}->[0]->{'type'}
  245. and $root->{'args'}->[0]->{'type'} eq 'brace_command_arg')
  246. or $root->{'cmdname'} eq 'math')) {
  247. my $sc = 1 if ($root->{'cmdname'} eq 'sc' or $in_sc);
  248. return _convert($root->{'args'}->[0], $sc);
  249. }
  250. }
  251. if ($root->{'contents'}) {
  252. foreach my $content (@{$root->{'contents'}}) {
  253. $result .= _convert($content, $in_sc);
  254. }
  255. }
  256. $result = '{'.$result.'}'
  257. if ($root->{'type'} and $root->{'type'} eq 'bracketed');
  258. return $result;
  259. }
  260. 1;
  261. __END__
  262. =head1 NAME
  263. Texinfo::Convert::NodeNameNormalization - Normalize and transliterate Texinfo trees
  264. =head1 SYNOPSIS
  265. use Texinfo::Convert::NodeNameNormalization qw(normalize_node
  266. transliterate_texinfo);
  267. my $normalized = normalize_node({'contents' => $node_contents});
  268. my $file_name = transliterate_texinfo({'contents'
  269. => $section_contents});
  270. =head1 DESCRIPTION
  271. Texinfo::Convert::NodeNameNormalization allows to normalize node names,
  272. with C<normalize_node> following the specification described in the
  273. Texinfo manual for HTML Xref. This is usefull each time one want a
  274. unique identifier for Texinfo content that is only composed of letter,
  275. digits, C<-> and C<_>. In C<Texinfo::Parser> C<normalize_node> is used
  276. for node, floats and anchor names normalization, but also float
  277. types C<@acronym> and C<@abbr> first argument.
  278. It is also possible to transliterate non ascii letters, instead of mangling
  279. them, with C<transliterate_texinfo>, losing the uniqueness feature of
  280. normalized node names.
  281. =head1 METHODS
  282. =over
  283. =item $normalized = normalize_node($tree)
  284. The Texinfo I<$tree> is returned as a string, normalized as described in the
  285. Texinfo manual for HTML Xref.
  286. The result will be poor for Texinfo trees which are not @-command arguments
  287. (on an @-command line or in braces), for instance if the tree contains
  288. C<@node> or block commands.
  289. =item $transliterated = transliterate_texinfo($tree, $no_unidecode)
  290. The Texinfo I<$tree> is returned as a string, with non ascii letters
  291. transliterated as ascii, but otherwise similar with C<normalize_node>
  292. output. If the optional I<$no_unidecode> argument is set, C<Text::Unidecode>
  293. is not used for characters whose transliteration is not built-in.
  294. =back
  295. =head1 AUTHOR
  296. Patrice Dumas, E<lt>pertusus@free.frE<gt>
  297. =head1 COPYRIGHT AND LICENSE
  298. Copyright 2010, 2011, 2012 Free Software Foundation, Inc.
  299. This library is free software; you can redistribute it and/or modify
  300. it under the terms of the GNU General Public License as published by
  301. the Free Software Foundation; either version 3 of the License, or (at
  302. your option) any later version.
  303. =cut