123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354 |
- # NodeNameNormalization.pm: output tree as normalized node name.
- #
- # Copyright 2010, 2011, 2012, 2016 Free Software Foundation, Inc.
- #
- # This program is free software; you can redistribute it and/or modify
- # it under the terms of the GNU General Public License as published by
- # the Free Software Foundation; either version 3 of the License,
- # or (at your option) any later version.
- #
- # This program is distributed in the hope that it will be useful,
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- # GNU General Public License for more details.
- #
- # You should have received a copy of the GNU General Public License
- # along with this program. If not, see <http://www.gnu.org/licenses/>.
- #
- # Original author: Patrice Dumas <pertusus@free.fr>
- # the rules for conversion are decribed in the Texinfo manual, for
- # HTML crossrefs.
- package Texinfo::Convert::NodeNameNormalization;
- use 5.00405;
- use strict;
- use Unicode::Normalize;
- use Text::Unidecode;
- # for the accents definition
- use Texinfo::Common;
- # reuse some conversion hashes
- use Texinfo::Convert::Text;
- # use the hashes and functions
- use Texinfo::Convert::Unicode;
- require Exporter;
- use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
- @ISA = qw(Exporter);
- # Items to export into callers namespace by default. Note: do not export
- # names by default without a very good reason. Use EXPORT_OK instead.
- # Do not simply export all your public functions/methods/constants.
- # This allows declaration use Texinfo::Convert::NodeNameNormalization ':all';
- # If you do not need this, moving things directly into @EXPORT or @EXPORT_OK
- # will save memory.
- %EXPORT_TAGS = ( 'all' => [ qw(
- normalize_node
- transliterate_texinfo
- ) ] );
- @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } );
- @EXPORT = qw(
- );
- my %normalize_node_brace_no_arg_commands
- = %Texinfo::Convert::Text::text_brace_no_arg_commands;
- foreach my $command (keys(%Texinfo::Convert::Unicode::unicode_character_brace_no_arg_commands)) {
- $normalize_node_brace_no_arg_commands{$command} =
- $Texinfo::Convert::Unicode::unicode_character_brace_no_arg_commands{$command};
- }
- my %normalize_node_no_brace_commands
- = %Texinfo::Common::no_brace_commands;
- $normalize_node_no_brace_commands{'*'} = ' ';
- my %accent_commands = %Texinfo::Common::accent_commands;
- my %ignored_brace_commands;
- foreach my $ignored_brace_command ('anchor', 'footnote', 'shortcaption',
- 'caption', 'hyphenation') {
- $ignored_brace_commands{$ignored_brace_command} = 1;
- }
- my %ignored_types;
- foreach my $type ('empty_line_after_command', 'preamble',
- 'empty_spaces_after_command', 'spaces_at_end',
- 'empty_spaces_before_argument', 'empty_spaces_before_paragraph',
- 'space_at_end_menu_node',
- 'empty_spaces_after_close_brace') {
- $ignored_types{$type} = 1;
- }
- sub normalize_node($)
- {
- my $root = shift;
- my $result = convert($root);
- $result = Unicode::Normalize::NFC($result);
- $result = _unicode_to_protected($result);
- $result = 'Top' if ($result =~ /^Top$/i);
- return $result;
- }
- sub transliterate_texinfo($;$)
- {
- my $root = shift;
- my $no_unidecode = shift;
- my $result = convert($root);
- $result = Unicode::Normalize::NFC($result);
- $result = _unicode_to_protected(
- _unicode_to_transliterate($result, $no_unidecode));
- return $result;
- }
- sub convert($)
- {
- my $root = shift;
- my $result = _convert($root);
- }
- sub _unicode_to_protected($)
- {
- my $text = shift;
- my $result = '';
- while ($text ne '') {
- if ($text =~ s/^([A-Za-z0-9]+)//o) {
- $result .= $1;
- } elsif ($text =~ s/^ //o) {
- $result .= '-';
- } elsif ($text =~ s/^(.)//o) {
- my $char = $1;
- if (exists($Texinfo::Convert::Unicode::unicode_simple_character_map{$char})) {
- $result .= '_' . lc($Texinfo::Convert::Unicode::unicode_simple_character_map{$char});
- } else {
- if (ord($char) <= hex(0xFFFF)) {
- $result .= '_' . lc(sprintf("%04x",ord($char)));
- } else {
- $result .= '__' . lc(sprintf("%06x",ord($char)));
- }
- }
- } else {
- warn "Bug: unknown character _unicode_to_protected (likely in infinite loop)\n";
- print STDERR "Text: !!$text!!\n";
- sleep 1;
- }
- }
- return $result;
- }
- sub _unicode_to_transliterate($;$)
- {
- my $text = shift;
- my $no_unidecode = shift;
- if (chomp($text)) {
- warn "Bug: end of line to transliterate: $text\n";
- }
- my $result = '';
- while ($text ne '') {
- if ($text =~ s/^([A-Za-z0-9 ]+)//o) {
- $result .= $1;
- } elsif ($text =~ s/^(.)//o) {
- my $char = $1;
- if (exists($Texinfo::Convert::Unicode::unicode_simple_character_map{$char})) {
- $result .= $char;
- } elsif (ord($char) <= hex(0xFFFF)
- and exists($Texinfo::Convert::Unicode::transliterate_map{uc(sprintf("%04x",ord($char)))})) {
- $result .= $Texinfo::Convert::Unicode::transliterate_map{uc(sprintf("%04x",ord($char)))};
- } elsif (ord($char) <= hex(0xFFFF)
- and exists($Texinfo::Convert::Unicode::diacritics_accent_commands{uc(sprintf("%04x",ord($char)))})) {
- $result .= '';
- # in this case, we want to avoid calling unidecode, as we are sure
- # that there is no useful transliteration of the unicode character
- # instead we want to keep it as is.
- # This is the case, for example, for @exclamdown, is corresponds
- # with x00a1, but unidecode transliterates it to a !, we want
- # to avoid that and keep x00a1.
- } elsif (ord($char) <= hex(0xFFFF)
- and exists($Texinfo::Convert::Unicode::no_transliterate_map{uc(sprintf("%04x",ord($char)))})) {
- $result .= $char;
- } else {
- if ($no_unidecode) {
- if (ord($char) <= hex(0xFFFF)
- and exists ($Texinfo::Convert::Unicode::transliterate_accent_map{uc(sprintf("%04x",ord($char)))})) {
- $result .= $Texinfo::Convert::Unicode::transliterate_accent_map{uc(sprintf("%04x",ord($char)))};
- } else {
- $result .= $char;
- }
- } else {
- $result .= unidecode($char);
- }
- }
- #print STDERR " ($no_unidecode) $text -> CHAR: ".ord($char)." ".uc(sprintf("%04x",ord($char)))."\n$result\n";
- } else {
- warn "Bug: unknown character _unicode_to_transliterate (likely in infinite loop)\n";
- print STDERR "Text: !!$text!!\n";
- sleep 1;
- }
- }
- return $result;
- }
- sub _convert($;$);
- sub _convert($;$)
- {
- my $root = shift;
- my $in_sc = shift;
- return '' if (($root->{'type'} and $ignored_types{$root->{'type'}})
- or ($root->{'cmdname'}
- and ($ignored_brace_commands{$root->{'cmdname'}}
- # here ignore the misc commands
- or ($root->{'args'} and $root->{'args'}->[0]
- and $root->{'args'}->[0]->{'type'}
- and ($root->{'args'}->[0]->{'type'} eq 'misc_line_arg'
- or $root->{'args'}->[0]->{'type'} eq 'misc_arg')))));
- my $result = '';
- if (defined($root->{'text'})) {
- $result = $root->{'text'};
- $result =~ s/\s+/ /go;
- $result = uc($result) if ($in_sc);
- }
- if ($root->{'cmdname'}) {
- my $command = $root->{'cmdname'};
- if (defined($normalize_node_no_brace_commands{$root->{'cmdname'}})) {
- return $normalize_node_no_brace_commands{$root->{'cmdname'}};
- } elsif (defined($normalize_node_brace_no_arg_commands{$root->{'cmdname'}})) {
- $command = $root->{'extra'}->{'clickstyle'}
- if ($root->{'extra'}
- and defined($root->{'extra'}->{'clickstyle'})
- and defined($normalize_node_brace_no_arg_commands{$root->{'extra'}->{'clickstyle'}}));
- my $result = $normalize_node_brace_no_arg_commands{$command};
- if ($in_sc and $Texinfo::Common::letter_no_arg_commands{$command}) {
- $result = uc($result);
- }
- return $result;
- # commands with braces
- } elsif ($accent_commands{$root->{'cmdname'}}) {
- return '' if (!$root->{'args'});
- my $accent_text = _convert($root->{'args'}->[0]);
- my $accented_char
- = Texinfo::Convert::Unicode::unicode_accent($accent_text,
- $root);
- if (!defined($accented_char)) {
- # In this case, the node normalization do not follow the specification,
- # but we cannot do better
- $accented_char = Texinfo::Convert::Text::ascii_accent($accent_text,
- $root);
- }
- if ($in_sc) {
- return uc ($accented_char);
- } else {
- return $accented_char;
- }
- #} elsif ($root->{'cmdname'} eq 'image') {
- # return _convert($root->{'args'}->[0]);
- } elsif ($Texinfo::Common::ref_commands{$root->{'cmdname'}}) {
- my @args_try_order;
- if ($root->{'cmdname'} eq 'inforef') {
- @args_try_order = (0, 1, 2);
- } else {
- @args_try_order = (0, 1, 2, 4, 3);
- }
- foreach my $index (@args_try_order) {
- if (defined($root->{'args'}->[$index])) {
- my $text = _convert($root->{'args'}->[$index]);
- return $text if (defined($text) and $text =~ /\S/);
- }
- }
- return '';
- # Here all the commands with args are processed, if they have
- # more than one arg the first one is used.
- } elsif ($root->{'args'} and $root->{'args'}->[0]
- and (($root->{'args'}->[0]->{'type'}
- and $root->{'args'}->[0]->{'type'} eq 'brace_command_arg')
- or $root->{'cmdname'} eq 'math')) {
- my $sc = 1 if ($root->{'cmdname'} eq 'sc' or $in_sc);
- return _convert($root->{'args'}->[0], $sc);
- }
- }
- if ($root->{'contents'}) {
- foreach my $content (@{$root->{'contents'}}) {
- $result .= _convert($content, $in_sc);
- }
- }
- $result = '{'.$result.'}'
- if ($root->{'type'} and $root->{'type'} eq 'bracketed');
- return $result;
- }
- 1;
- __END__
- =head1 NAME
- Texinfo::Convert::NodeNameNormalization - Normalize and transliterate Texinfo trees
- =head1 SYNOPSIS
- use Texinfo::Convert::NodeNameNormalization qw(normalize_node
- transliterate_texinfo);
- my $normalized = normalize_node({'contents' => $node_contents});
- my $file_name = transliterate_texinfo({'contents'
- => $section_contents});
- =head1 DESCRIPTION
- Texinfo::Convert::NodeNameNormalization allows to normalize node names,
- with C<normalize_node> following the specification described in the
- Texinfo manual for HTML Xref. This is usefull each time one want a
- unique identifier for Texinfo content that is only composed of letter,
- digits, C<-> and C<_>. In C<Texinfo::Parser> C<normalize_node> is used
- for node, floats and anchor names normalization, but also float
- types C<@acronym> and C<@abbr> first argument.
- It is also possible to transliterate non ascii letters, instead of mangling
- them, with C<transliterate_texinfo>, losing the uniqueness feature of
- normalized node names.
- =head1 METHODS
- =over
- =item $normalized = normalize_node($tree)
- The Texinfo I<$tree> is returned as a string, normalized as described in the
- Texinfo manual for HTML Xref.
- The result will be poor for Texinfo trees which are not @-command arguments
- (on an @-command line or in braces), for instance if the tree contains
- C<@node> or block commands.
- =item $transliterated = transliterate_texinfo($tree, $no_unidecode)
- The Texinfo I<$tree> is returned as a string, with non ascii letters
- transliterated as ascii, but otherwise similar with C<normalize_node>
- output. If the optional I<$no_unidecode> argument is set, C<Text::Unidecode>
- is not used for characters whose transliteration is not built-in.
- =back
- =head1 AUTHOR
- Patrice Dumas, E<lt>pertusus@free.frE<gt>
- =head1 COPYRIGHT AND LICENSE
- Copyright 2010, 2011, 2012 Free Software Foundation, Inc.
- This library is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation; either version 3 of the License, or (at
- your option) any later version.
- =cut
|