123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507 |
- # ParagraphNonXS.pm: handle paragraph text.
- #
- # Copyright 2010, 2011, 2012, 2013, 2014, 2015 Free Software Foundation, Inc.
- #
- # This program is free software; you can redistribute it and/or modify
- # it under the terms of the GNU General Public License as published by
- # the Free Software Foundation; either version 3 of the License,
- # or (at your option) any later version.
- #
- # This program is distributed in the hope that it will be useful,
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- # GNU General Public License for more details.
- #
- # You should have received a copy of the GNU General Public License
- # along with this program. If not, see <http://www.gnu.org/licenses/>.
- #
- # Original author: Patrice Dumas <pertusus@free.fr>
- # this module has nothing Texinfo specific. In contrast with existing
- # modules Text::Wrap, Text::Format, it keeps a state of the paragraph
- # and waits for text to be fed into it.
- package Texinfo::Convert::ParagraphNonXS;
- use 5.006;
- use strict;
- use Unicode::EastAsianWidth;
- use Carp qw(cluck);
# Constructor: build a paragraph formatter object.
# $CONF, when defined, is a hash reference whose entries override (or add to)
# the default state listed below.
sub new($;$)
{
  my ($class, $conf) = @_;

  # Defaults come first so that any key present in $conf wins.
  my %state = (
    'max'            => 72,
    'indent_length'  => 0,
    'counter'        => 0,
    'word_counter'   => 0,
    'space'          => '',
    'frenchspacing'  => 0,
    'lines_counter'  => 0,
    'end_line_count' => 0,
    'unfilled'       => 0,
    (defined($conf) ? %$conf : ()),
  );

  return bless \%state, $class;
}
# Debugging aid: print the counters, the pending word, the pending space and
# the end_sentence flag to STDERR.  Undefined word/end_sentence are shown as
# the string UNDEF.
sub dump($)
{
  my ($self) = @_;

  my $word = defined($self->{'word'}) ? $self->{'word'} : 'UNDEF';
  my $end_sentence = defined($self->{'end_sentence'})
                       ? $self->{'end_sentence'} : 'UNDEF';

  print STDERR "para ($self->{'counter'}+$self->{'word_counter'}) word: $word, space `$self->{'space'}' end_sentence: $end_sentence\n";
}
# Break the line because the maximum width was exceeded.  Nothing is output
# (and no state is changed) when the paragraph ignores columns; otherwise the
# line is ended through _end_line.
sub _cut_line($)
{
  my ($paragraph) = @_;

  return $paragraph->{'ignore_columns'} ? '' : _end_line($paragraph);
}
# Accessor: return the paragraph's current 'end_line_count' value.
sub end_line_count($)
{
  my ($paragraph) = @_;

  return $paragraph->{'end_line_count'};
}
# Public entry point to end the current line: 'end_line_count' is reset
# first, then _end_line does the work (and increments it again).
sub end_line($)
{
  my ($paragraph) = @_;

  $paragraph->{'end_line_count'} = 0;

  return _end_line($paragraph);
}
# End the current line: reset the column counter and the pending space,
# switch to 'indent_length_next' if one is defined, increment both line
# counters and return the newline to output.
sub _end_line($)
{
  my ($paragraph) = @_;

  @{$paragraph}{'counter', 'space'} = (0, '');

  # An indentation registered for following lines takes effect now and is
  # consumed.
  my $next_indent = $paragraph->{'indent_length_next'};
  if (defined($next_indent)) {
    delete $paragraph->{'indent_length_next'};
    $paragraph->{'indent_length'} = $next_indent;
  }

  foreach my $line_counter ('lines_counter', 'end_line_count') {
    $paragraph->{$line_counter}++;
  }

  print STDERR "END_LINE\n" if ($paragraph->{'DEBUG'});
  return "\n";
}
# Return the text that is held back and not yet output: the pending space
# (if it is a true value) followed by the pending word (if defined).
# The paragraph state is not modified.
sub get_pending($)
{
  my ($paragraph) = @_;

  my $pending = $paragraph->{'space'} ? $paragraph->{'space'} : '';
  $pending .= $paragraph->{'word'} if (defined($paragraph->{'word'}));

  return $pending;
}
# Public entry point to flush the pending word: 'end_line_count' is reset,
# then _add_pending_word returns the text to output.  A true $ADD_SPACES
# makes pending spaces be output even when no word is pending.
sub add_pending_word($;$)
{
  my ($paragraph, $add_spaces) = @_;

  $paragraph->{'end_line_count'} = 0;

  return _add_pending_word($paragraph, $add_spaces);
}
# Move the pending space and word into the returned string, updating the
# column counter.  Nothing happens unless a word is pending or $ADD_SPACES
# is true.
sub _add_pending_word($;$)
{
  my ($paragraph, $add_spaces) = @_;

  return '' unless (defined($paragraph->{'word'}) or $add_spaces);

  my $debug = $paragraph->{'DEBUG'};
  my $output = '';

  # Pad up to the indentation column if the line has not reached it yet.
  my $missing_indent = $paragraph->{'indent_length'} - $paragraph->{'counter'};
  if ($missing_indent > 0) {
    $output .= ' ' x $missing_indent;
    $paragraph->{'counter'} = $paragraph->{'indent_length'};
    print STDERR "INDENT($paragraph->{'counter'}+$paragraph->{'word_counter'})\n"
      if ($debug);
    # The pending space is dropped after indenting, except in unfilled mode.
    delete $paragraph->{'space'} unless ($paragraph->{'unfilled'});
  }

  my $space = $paragraph->{'space'};
  if ($space) {
    $output .= $space;
    $paragraph->{'counter'} += length($space);
    print STDERR "ADD_SPACES($paragraph->{'counter'}+$paragraph->{'word_counter'})\n"
      if ($debug);
  }

  if (defined($paragraph->{'word'})) {
    $output .= $paragraph->{'word'};
    $paragraph->{'counter'} += $paragraph->{'word_counter'};
    print STDERR "ADD_WORD[$paragraph->{'word'}]+$paragraph->{'word_counter'} ($paragraph->{'counter'})\n"
      if ($debug);
    # The word has been consumed: forget it and its bookkeeping.
    $paragraph->{'word'} = undef;
    $paragraph->{'last_char'} = undef;
    $paragraph->{'word_counter'} = 0;
  }

  $paragraph->{'space'} = '';
  return $output;
}
# End the paragraph: flush what is pending (pending spaces too when
# 'add_final_space' is set) and, unless 'no_final_newline' is set or the
# current line is empty, terminate the line with a newline.
sub end($)
{
  my ($paragraph) = @_;

  $paragraph->{'end_line_count'} = 0;
  print STDERR "PARA END\n" if ($paragraph->{'DEBUG'});

  my $result = _add_pending_word($paragraph, $paragraph->{'add_final_space'});

  # No final newline requested, or nothing on the current line.
  return $result
    if ($paragraph->{'no_final_newline'} or $paragraph->{'counter'} == 0);

  $paragraph->{'lines_counter'}++;
  $paragraph->{'end_line_count'}++;
  return $result . "\n";
}
# Regex-quoted character lists, meant to be interpolated inside bracketed
# character classes: sentence-ending punctuation, and the characters that
# may follow it while still belonging to the end of the sentence.
my ($end_sentence_character, $after_punctuation_characters)
  = map { quotemeta($_) } ('.?!', q{"')]});
# Add $WORD to the paragraph and return the text to be output as a result.
# End of sentence punctuation in $WORD that should be allowed to end a
# sentence, but would otherwise be preceded by an upper-case letter, should
# instead be preceded by a backspace character.
# 'end_line_count' is reset before delegating to _add_next.
sub add_next($;$$)
{
  my ($paragraph, $word, $transparent) = @_;

  $paragraph->{'end_line_count'} = 0;

  return _add_next($paragraph, $word, $transparent);
}
# Append $WORD to the pending word without wrapping it.
#   $TRANSPARENT          if true, 'last_char' is left alone.
#   $NEWLINES_IMPOSSIBLE  if true, $WORD is not searched for newlines.
# Returns the text to output ('' when $WORD is undefined).
sub _add_next($;$$$)
{
  my ($paragraph, $word, $transparent, $newlines_impossible) = @_;

  return '' unless (defined($word));

  my $result = '';

  # Reverse the insertion of any control characters in Plaintext.pm: a
  # trailing backspace is remembered, then all backspaces are stripped.
  my $disinhibit = ($word =~ /\x08$/) ? 1 : 0;
  $word =~ s/\x08//g;

  if (!defined($paragraph->{'word'})) {
    # A new word begins here.
    $paragraph->{'word'} = '';
    $paragraph->{'last_char'} = '';
    if ($paragraph->{'end_sentence'}
        and $paragraph->{'end_sentence'} > 0
        and !$paragraph->{'frenchspacing'}
        and $paragraph->{'counter'} != 0 and $paragraph->{'space'}) {
      # Pad the pending space to two characters after an end of sentence,
      # but do not double space if there are leading spaces in the word.
      unless ($word =~ /^\s/) {
        $paragraph->{'space'} .= ' ' x (2 - length($paragraph->{'space'}));
      }
      delete $paragraph->{'end_sentence'};
    }
  }

  $paragraph->{'word'} .= $word;

  unless ($transparent) {
    if ($disinhibit) {
      # Behave as if the word ended with a lower-case letter.
      $paragraph->{'last_char'} = 'a';
    } elsif ($word =~
       /([^$end_sentence_character$after_punctuation_characters])
        [$end_sentence_character$after_punctuation_characters]*$/x) {
      # Save the last character in $word before punctuation.
      $paragraph->{'last_char'} = $1;
    }
  }

  if (!$newlines_impossible and $word =~ /\n/) {
    # The word carries its own newline: output it and start a new line.
    $result .= _add_pending_word($paragraph);
    _end_line($paragraph);
    $paragraph->{'word_counter'} = 0;
    $paragraph->{'word'} = undef;
    $paragraph->{'last_char'} = undef;
  } else {
    # We don't count DEL bytes here for INFO_SPECIAL_CHARS_QUOTE.  We
    # shouldn't count combining characters for accents either: see the
    # t/converters_tests.t (at_commands_in_refs_utf8) test.
    $paragraph->{'word_counter'} += length($word) - ($word =~ tr/\177//);
  }

  if ($paragraph->{'DEBUG'}) {
    my $para_word = defined($paragraph->{'word'})
                      ? $paragraph->{'word'} : 'UNDEF';
    print STDERR "WORD+ $word -> $para_word\n";
  }

  # The $paragraph->{'counter'} != 0 is here to avoid having an
  # additional line output when the text is longer than the max.
  if ($paragraph->{'counter'} != 0 and
      $paragraph->{'counter'} + $paragraph->{'word_counter'} +
         length($paragraph->{'space'}) > $paragraph->{'max'}) {
    $result .= _cut_line($paragraph);
  }

  return $result;
}
# Cancel a pending end of sentence by setting 'end_sentence' to 0.
sub remove_end_sentence($)
{
  my ($paragraph) = @_;

  return $paragraph->{'end_sentence'} = 0;
}
# Store $VALUE (possibly undef) as the paragraph's 'end_sentence' state.
sub add_end_sentence($;$) {
  my ($paragraph, $value) = @_;

  return $paragraph->{'end_sentence'} = $value;
}
# Record a lower-case letter as the last character seen, so that following
# end of sentence punctuation is not treated as preceded by an upper-case
# letter.
sub allow_end_sentence($)
{
  my ($paragraph) = @_;

  if ($paragraph->{'DEBUG'}) {
    printf STDERR "ALLOW END SENTENCE\n";
  }

  return $paragraph->{'last_char'} = 'a'; # lower-case
}
# Update the formatting switches of the paragraph.  Every argument after
# $PARAGRAPH is optional and an undefined value leaves the corresponding
# setting unchanged:
#   $SPACE_PROTECTION       -> 'protect_spaces'
#   $IGNORE_COLUMNS         -> 'ignore_columns'
#   $KEEP_END_LINES         -> 'keep_end_lines'
#   $FRENCHSPACING          -> 'frenchspacing'
#   $DOUBLE_WIDTH_NO_BREAK  -> 'double_width_no_break'
# Always returns ''.
sub set_space_protection($$;$$$$)
{
  my ($paragraph, $space_protection, $ignore_columns, $keep_end_lines,
      $frenchspacing, $double_width_no_break) = @_;

  my %requested = (
    'protect_spaces' => $space_protection,
    'ignore_columns' => $ignore_columns,
    'keep_end_lines' => $keep_end_lines,
  );
  foreach my $setting (keys(%requested)) {
    $paragraph->{$setting} = $requested{$setting}
      if (defined($requested{$setting}));
  }

  # When frenchspacing gets switched on while an end of sentence space is
  # pending (and no word has been started), pad that space to two characters
  # first.  This must be tested before 'frenchspacing' is updated below.
  if ($frenchspacing and !$paragraph->{'frenchspacing'}
      and $paragraph->{'end_sentence'} and $paragraph->{'counter'} != 0
      and $paragraph->{'space'} and !defined($paragraph->{'word'})) {
    $paragraph->{'space'} .= ' ' x (2 - length($paragraph->{'space'}));
    print STDERR "SWITCH frenchspacing end sentence space\n"
      if ($paragraph->{'DEBUG'});
    delete $paragraph->{'end_sentence'};
  }

  $paragraph->{'frenchspacing'} = $frenchspacing
    if (defined($frenchspacing));
  $paragraph->{'double_width_no_break'} = $double_width_no_break
    if (defined($double_width_no_break));

  # begin a word, to have something even if empty
  _add_next($paragraph, '') if ($space_protection);

  return '';
}
# Wrap $TEXT, returning the wrapped text, taking into account the current state
# of $PARAGRAPH.  Any end of sentence punctuation in $TEXT that should be
# allowed to end a sentence but which would otherwise be preceded by an
# upper-case letter should instead be preceded by a backspace character.
#
# $TEXT is split into runs of breakable whitespace, single full-width
# characters (\p{InFullwidth}) and runs of everything else (non-breaking
# spaces U+202F and U+00A0 count as word characters); each piece updates the
# pending word/space state and may produce output.
sub add_text($$)
{
  my $paragraph = shift;
  my $text = shift;
  $paragraph->{'end_line_count'} = 0;
  my $result = '';

  my $protect_spaces_flag = $paragraph->{'protect_spaces'};

  my @segments = split
/([^\S\x{202f}\x{00a0}]+)|(\p{InFullwidth})|((?:[^\s\p{InFullwidth}]|[\x{202f}\x{00a0}])+)/,
    $text;

  # Check now if a newline exists anywhere in the string to
  # try to eliminate regex checks later.
  my $newline_possible_flag = ($text =~ /\n/);

  my $debug_flag = $paragraph->{'DEBUG'};

  while (@segments) {
    # $empty_segment should be an empty string; the other variables
    # here were recognized as field separators by split, and at most one
    # of them is defined for each group of four.
    my ($empty_segment, $spaces, $fullwidth_segment, $added_word)
     = splice (@segments, 0, 4);

    if ($debug_flag) {
      my $word = 'UNDEF';
      $word = $paragraph->{'word'} if (defined($paragraph->{'word'}));
      print STDERR "p ($paragraph->{'counter'}+$paragraph->{'word_counter'}) s `"._print_escaped_spaces($paragraph->{'space'})."', w `$word'\n";
    }

    # \x{202f}\x{00a0} are non breaking spaces
    if (defined $spaces) {
      print STDERR "SPACES($paragraph->{'counter'}) `"._print_escaped_spaces($spaces)."'\n" if $debug_flag;
      if ($protect_spaces_flag) {
        # Protected spaces are part of the word; newlines become spaces.
        $paragraph->{'word'} .= $spaces;
        $paragraph->{'last_char'} = substr($spaces, -1);
        $paragraph->{'word_counter'} += length($spaces);
        $paragraph->{'word'} =~ s/\n/ /g;

        # The $paragraph->{'counter'} != 0 is here to avoid having an
        # additional line output when the text is longer than the max.
        if ($paragraph->{'counter'} != 0 and
            $paragraph->{'counter'} + $paragraph->{'word_counter'} +
               length($paragraph->{'space'}) > $paragraph->{'max'}) {
          $result .= _cut_line($paragraph);
        }
      } else {
        # Remember whether a word was pending before it gets flushed.
        my $pending_word = $paragraph->{'word'};

        $result .= _add_pending_word($paragraph);
        if ($paragraph->{'counter'} != 0 or $paragraph->{'unfilled'}
            or (defined $pending_word)) {
          if ($paragraph->{'end_sentence'}
              and $paragraph->{'end_sentence'} > 0
              and !$paragraph->{'frenchspacing'}
              and !$paragraph->{'unfilled'}) {
            if (length($paragraph->{'space'}) >= 1 or length($spaces) > 1) {
              # more than one space, we can make sure that there are only
              # 2 spaces
              my $all_spaces = substr($paragraph->{'space'} . $spaces, 0, 2);
              $all_spaces =~ s/[\n\r]/ /g;
              $all_spaces .= ' ' x (2 - length($all_spaces));
              $paragraph->{'space'} = $all_spaces;
            } else {
              # if there is only one space, we let it accumulate
              my $new_space = $spaces;
              $new_space =~ s/^[\n\r]/ /;
              $paragraph->{'space'} = $new_space;
            }
          } else {
            # Only save the first space
            if ($paragraph->{'unfilled'}
                or length($paragraph->{'space'}) < 1) {
              if ($spaces =~ /\n/) {
                if (!$paragraph->{'unfilled'}) {
                  $paragraph->{'space'} = ' ';
                } else {
                  # Unfilled text keeps its line breaks.
                  $result .= _add_pending_word ($paragraph);
                  $result .= _end_line ($paragraph);
                }
              } else {
                if (!$paragraph->{'unfilled'}) {
                  $spaces =~ s/\r/ /g;
                  $paragraph->{'space'} .= substr ($spaces, 0, 1);
                } else {
                  $paragraph->{'space'} .= $spaces;
                }
              }
            }
          }
        }
      }
      if ($paragraph->{'counter'} + length($paragraph->{'space'})
                      > $paragraph->{'max'}) {
        $result .= _cut_line($paragraph);
      }
      if ($newline_possible_flag and !$paragraph->{'unfilled'}
          and $paragraph->{'keep_end_lines'} and $spaces =~ /\n/) {
        $result .= _end_line($paragraph);
      }
    } elsif (defined $added_word) {
      my $tmp = $added_word;
      if (defined $paragraph->{'last_char'}) {
        # Use 'last_char' here because _add_next overwrites it.
        $tmp = $paragraph->{'last_char'} . $tmp;
      }

      $result .= _add_next($paragraph, $added_word, undef,
                           !$newline_possible_flag);

      # Check if it is considered as an end of sentence.  There are two things
      # to check: one, that we have a ., ! or ?; and second, that it is not
      # preceded by an upper-case letter (ignoring some punctuation)
      if (defined($paragraph->{'end_sentence'})
          and $added_word =~ /^[$after_punctuation_characters]*$/o) {
        # do nothing in the case of a continuation of after_punctuation_characters
      } elsif (!$paragraph->{'unfilled'}
          and $tmp =~
        /(^|[^[:upper:]$after_punctuation_characters$end_sentence_character])
         [$after_punctuation_characters]*[$end_sentence_character]
         [$end_sentence_character\x08$after_punctuation_characters]*$/x) {
        # -1 records an end of sentence seen while frenchspacing is on.
        if ($paragraph->{'frenchspacing'}) {
          $paragraph->{'end_sentence'} = -1;
        } else {
          $paragraph->{'end_sentence'} = 1;
        }
        print STDERR "END_SENTENCE\n" if ($debug_flag);
      } else {
        # Trace before deleting: once the key is gone the defined() guard
        # can never be true.
        print STDERR "delete END_SENTENCE($paragraph->{'end_sentence'}): text\n"
          if (defined($paragraph->{'end_sentence'}) and $debug_flag);
        delete $paragraph->{'end_sentence'};
      }
    } elsif (defined $fullwidth_segment) {
      print STDERR "EAST_ASIAN\n" if ($debug_flag);
      if (!defined($paragraph->{'word'})) {
        $paragraph->{'word'} = '';
      }
      $paragraph->{'word'} .= $fullwidth_segment;
      $paragraph->{'last_char'} = $fullwidth_segment;
      # A full-width character takes two columns.
      $paragraph->{'word_counter'} += 2;
      if ($paragraph->{'counter'} != 0 and
          $paragraph->{'counter'} + $paragraph->{'word_counter'}
                               > $paragraph->{'max'}) {
        $result .= _cut_line($paragraph);
      }
      # Unless told otherwise, the character is flushed at once so that a
      # line break is possible right after it.
      if (!$paragraph->{'protect_spaces'}
          and !$paragraph->{'double_width_no_break'}) {
        $result .= _add_pending_word($paragraph);
        $paragraph->{'space'} = '';
      }
      delete $paragraph->{'end_sentence'};
    }
  }
  return $result;
}
# Debugging aid: return $SPACES with whitespace made visible.  A plain space
# and any non-whitespace character are kept as is, form feed and newline
# become the two-character sequences \f and \n, and any other whitespace
# character becomes \x followed by its code point in hexadecimal (at least 4
# digits up to U+FFFF, at least 6 digits above).
sub _print_escaped_spaces($)
{
  my $spaces = shift;
  my $result = '';
  foreach my $pos (0 .. length($spaces)-1) {
    my $char = substr($spaces, $pos, 1);
    if ($char eq "\f") {
      $result .= '\f';
    } elsif ($char eq "\n") {
      $result .= '\n';
    } elsif ($char ne ' ' and $char =~ /\s/) {
      # Compare against the numeric literal directly: hex() expects a string
      # of hex digits, so hex(0xFFFF) would yield 0x65535, not 0xFFFF.
      if (ord($char) <= 0xFFFF) {
        $result .= sprintf('\x%04x', ord($char));
      } else {
        $result .= sprintf('\x%06x', ord($char));
      }
    } else {
      $result .= $char;
    }
  }
  return $result;
}
- 1;
|