ParagraphNonXS.pm 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507
  1. # ParagraphNonXS.pm: handle paragraph text.
  2. #
  3. # Copyright 2010, 2011, 2012, 2013, 2014, 2015 Free Software Foundation, Inc.
  4. #
  5. # This program is free software; you can redistribute it and/or modify
  6. # it under the terms of the GNU General Public License as published by
  7. # the Free Software Foundation; either version 3 of the License,
  8. # or (at your option) any later version.
  9. #
  10. # This program is distributed in the hope that it will be useful,
  11. # but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13. # GNU General Public License for more details.
  14. #
  15. # You should have received a copy of the GNU General Public License
  16. # along with this program. If not, see <http://www.gnu.org/licenses/>.
  17. #
  18. # Original author: Patrice Dumas <pertusus@free.fr>
  19. # this module has nothing Texinfo specific. In contrast with existing
  20. # modules Text::Wrap, Text::Format, it keeps a state of the paragraph
  21. # and waits for text to be fed into it.
  22. package Texinfo::Convert::ParagraphNonXS;
  23. use 5.006;
  24. use strict;
  25. use Unicode::EastAsianWidth;
  26. use Carp qw(cluck);
  27. # initialize a paragraph object.
  28. sub new($;$)
  29. {
  30. my $class = shift;
  31. my $conf = shift;
  32. my $self = {'max' => 72, 'indent_length' => 0, 'counter' => 0,
  33. 'word_counter' => 0, 'space' => '', 'frenchspacing' => 0,
  34. 'lines_counter' => 0, 'end_line_count' => 0,
  35. 'unfilled' => 0 };
  36. if (defined($conf)) {
  37. foreach my $key (keys(%$conf)) {
  38. $self->{$key} = $conf->{$key};
  39. }
  40. }
  41. bless $self, $class;
  42. }
  43. # for debugging
  44. sub dump($)
  45. {
  46. my $self = shift;
  47. my $word = 'UNDEF';
  48. if (defined($self->{'word'})) {
  49. $word = $self->{'word'};
  50. }
  51. my $end_sentence = 'UNDEF';
  52. $end_sentence = $self->{'end_sentence'} if (defined($self->{'end_sentence'}));
  53. print STDERR "para ($self->{'counter'}+$self->{'word_counter'}) word: $word, space `$self->{'space'}' end_sentence: $end_sentence\n";
  54. }
  55. sub _cut_line($)
  56. {
  57. my $paragraph = shift;
  58. return '' if ($paragraph->{'ignore_columns'});
  59. return _end_line($paragraph);
  60. }
  61. sub end_line_count($)
  62. {
  63. my $paragraph = shift;
  64. return $paragraph->{'end_line_count'};
  65. }
  66. sub end_line($)
  67. {
  68. my $paragraph = shift;
  69. $paragraph->{'end_line_count'} = 0;
  70. return _end_line($paragraph);
  71. }
  72. # end a line.
  73. sub _end_line($)
  74. {
  75. my $paragraph = shift;
  76. $paragraph->{'counter'} = 0;
  77. $paragraph->{'space'} = '';
  78. if (defined($paragraph->{'indent_length_next'})) {
  79. $paragraph->{'indent_length'} = $paragraph->{'indent_length_next'};
  80. delete $paragraph->{'indent_length_next'};
  81. }
  82. $paragraph->{'lines_counter'}++;
  83. $paragraph->{'end_line_count'}++;
  84. print STDERR "END_LINE\n" if ($paragraph->{'DEBUG'});
  85. return "\n";
  86. }
  87. sub get_pending($)
  88. {
  89. my $paragraph = shift;
  90. my $result = '';
  91. if ($paragraph->{'space'}) {
  92. $result .= $paragraph->{'space'};
  93. }
  94. if (defined($paragraph->{'word'})) {
  95. $result .= $paragraph->{'word'};
  96. }
  97. return $result;
  98. }
  99. sub add_pending_word($;$)
  100. {
  101. my $paragraph = shift;
  102. my $add_spaces = shift;
  103. $paragraph->{'end_line_count'} = 0;
  104. return _add_pending_word($paragraph, $add_spaces);
  105. }
  106. # put a pending word and spaces in the result string.
  107. sub _add_pending_word($;$)
  108. {
  109. my $paragraph = shift;
  110. my $add_spaces = shift;
  111. my $result = '';
  112. if (defined($paragraph->{'word'}) or $add_spaces) {
  113. if ($paragraph->{'indent_length'} > $paragraph->{'counter'}) {
  114. $result .= ' ' x ($paragraph->{'indent_length'} - $paragraph->{'counter'});
  115. $paragraph->{'counter'} = $paragraph->{'indent_length'};
  116. print STDERR "INDENT($paragraph->{'counter'}+$paragraph->{'word_counter'})\n"
  117. if ($paragraph->{'DEBUG'});
  118. delete $paragraph->{'space'} unless $paragraph->{'unfilled'};
  119. }
  120. if ($paragraph->{'space'}) {
  121. $result .= $paragraph->{'space'};
  122. $paragraph->{'counter'} += length($paragraph->{'space'});
  123. print STDERR "ADD_SPACES($paragraph->{'counter'}+$paragraph->{'word_counter'})\n"
  124. if ($paragraph->{'DEBUG'});
  125. }
  126. if (defined($paragraph->{'word'})) {
  127. $result .= $paragraph->{'word'};
  128. $paragraph->{'counter'} += $paragraph->{'word_counter'};
  129. print STDERR "ADD_WORD[$paragraph->{'word'}]+$paragraph->{'word_counter'} ($paragraph->{'counter'})\n"
  130. if ($paragraph->{'DEBUG'});
  131. $paragraph->{'word'} = undef;
  132. $paragraph->{'last_char'} = undef;
  133. $paragraph->{'word_counter'} = 0;
  134. }
  135. $paragraph->{'space'} = '';
  136. }
  137. return $result;
  138. }
  139. # end a paragraph
  140. sub end($)
  141. {
  142. my $paragraph = shift;
  143. $paragraph->{'end_line_count'} = 0;
  144. print STDERR "PARA END\n" if ($paragraph->{'DEBUG'});
  145. my $result = _add_pending_word($paragraph, $paragraph->{'add_final_space'});
  146. if (!$paragraph->{'no_final_newline'} and $paragraph->{'counter'} != 0) {
  147. $result .= "\n";
  148. $paragraph->{'lines_counter'}++;
  149. $paragraph->{'end_line_count'}++;
  150. }
  151. return $result;
  152. }
  153. my $end_sentence_character = quotemeta('.?!');
  154. my $after_punctuation_characters = quotemeta('"\')]');
# Add $WORD to paragraph, returning the text to be added to the paragraph.
# Any end of sentence punctuation in $WORD that should be allowed to end a
# sentence but which would otherwise be preceded by an upper-case letter should
# instead be preceded by a backspace character.
  159. sub add_next($;$$)
  160. {
  161. my $paragraph = shift;
  162. my $word = shift;
  163. my $transparent = shift;
  164. $paragraph->{'end_line_count'} = 0;
  165. return _add_next($paragraph, $word, $transparent);
  166. }
  167. # add a word (without wrapping).
  168. sub _add_next($;$$$)
  169. {
  170. my $paragraph = shift;
  171. my $word = shift;
  172. my $transparent = shift;
  173. my $newlines_impossible = shift;
  174. my $result = '';
  175. if (defined($word)) {
  176. my $disinhibit = 0;
  177. # Reverse the insertion of any control characters in Plaintext.pm.
  178. if ($word =~ /\x08$/) {
  179. $disinhibit = 1;
  180. }
  181. $word =~ s/\x08//g;
  182. if (!defined($paragraph->{'word'})) {
  183. $paragraph->{'word'} = '';
  184. $paragraph->{'last_char'} = '';
  185. if ($paragraph->{'end_sentence'}
  186. and $paragraph->{'end_sentence'} > 0
  187. and !$paragraph->{'frenchspacing'}
  188. and $paragraph->{'counter'} != 0 and $paragraph->{'space'}) {
  189. # do not double space if there are leading spaces in word
  190. if ($word !~ /^\s/) {
  191. #$paragraph->{'space'} = ' ';
  192. $paragraph->{'space'} .= ' ' x (2 - length($paragraph->{'space'}));
  193. }
  194. delete $paragraph->{'end_sentence'};
  195. }
  196. }
  197. $paragraph->{'word'} .= $word;
  198. if (!$transparent) {
  199. if ($disinhibit) {
  200. $paragraph->{'last_char'} = 'a';
  201. } elsif ($word =~
  202. /([^$end_sentence_character$after_punctuation_characters])
  203. [$end_sentence_character$after_punctuation_characters]*$/x) {
  204. # Save the last character in $word before punctuation
  205. $paragraph->{'last_char'} = $1;
  206. }
  207. }
  208. if (!$newlines_impossible and $word =~ /\n/) {
  209. $result .= _add_pending_word ($paragraph);
  210. _end_line($paragraph);
  211. $paragraph->{'word_counter'} = 0;
  212. $paragraph->{'word'} = undef;
  213. $paragraph->{'last_char'} = undef;
  214. } else {
  215. my $word2;
  216. $word2 = $word;
  217. $word2 =~ s/[\177]//g;
  218. $paragraph->{'word_counter'} += length($word2);
  219. # We don't count DEL bytes here for INFO_SPECIAL_CHARS_QUOTE. We
  220. # shouldn't count combining characters for accents either: see the
  221. # t/converters_tests.t (at_commands_in_refs_utf8) test.
  222. }
  223. if ($paragraph->{'DEBUG'}) {
  224. my $para_word = 'UNDEF';;
  225. if (defined($paragraph->{'word'})) {
  226. $para_word = $paragraph->{'word'};
  227. }
  228. print STDERR "WORD+ $word -> $para_word\n";
  229. }
  230. # The $paragraph->{'counter'} != 0 is here to avoid having an
  231. # additional line output when the text is longer than the max.
  232. if ($paragraph->{'counter'} != 0 and
  233. $paragraph->{'counter'} + $paragraph->{'word_counter'} +
  234. length($paragraph->{'space'}) > $paragraph->{'max'}) {
  235. $result .= _cut_line($paragraph);
  236. }
  237. }
  238. return $result;
  239. }
  240. sub remove_end_sentence($)
  241. {
  242. my $paragraph = shift;
  243. $paragraph->{'end_sentence'} = 0;
  244. }
  245. sub add_end_sentence($;$) {
  246. my $paragraph = shift;
  247. my $value = shift;
  248. $paragraph->{'end_sentence'} = $value;
  249. }
  250. sub allow_end_sentence($)
  251. {
  252. my $paragraph = shift;
  253. printf STDERR "ALLOW END SENTENCE\n" if $paragraph->{'DEBUG'};
  254. $paragraph->{'last_char'} = 'a'; # lower-case
  255. }
  256. sub set_space_protection($$;$$$$)
  257. {
  258. my $paragraph = shift;
  259. my $space_protection = shift;
  260. my $ignore_columns = shift;
  261. my $keep_end_lines = shift;
  262. my $frenchspacing = shift;
  263. my $double_width_no_break = shift;
  264. $paragraph->{'protect_spaces'} = $space_protection
  265. if defined($space_protection);
  266. $paragraph->{'ignore_columns'} = $ignore_columns
  267. if defined($ignore_columns);
  268. $paragraph->{'keep_end_lines'} = $keep_end_lines
  269. if defined($keep_end_lines);
  270. if (!$paragraph->{'frenchspacing'} and $frenchspacing
  271. and $paragraph->{'end_sentence'} and $paragraph->{'counter'} != 0
  272. and $paragraph->{'space'} and !defined($paragraph->{'word'})) {
  273. $paragraph->{'space'} .= ' ' x (2 - length($paragraph->{'space'}));
  274. print STDERR "SWITCH frenchspacing end sentence space\n"
  275. if ($paragraph->{'DEBUG'});
  276. delete $paragraph->{'end_sentence'};
  277. }
  278. $paragraph->{'frenchspacing'} = $frenchspacing
  279. if defined($frenchspacing);
  280. $paragraph->{'double_width_no_break'} = $double_width_no_break
  281. if defined($double_width_no_break);
  282. # begin a word, to have something even if empty
  283. if ($space_protection) {
  284. _add_next($paragraph, '');
  285. }
  286. return '';
  287. }
# Wrap $TEXT, returning the wrapped text, taking into account the current state
# of $PARAGRAPH.  Any end of sentence punctuation in $TEXT that should be
# allowed to end a sentence but which would otherwise be preceded by an
# upper-case letter should instead be preceded by a backspace character.
  292. sub add_text($$)
  293. {
  294. my $paragraph = shift;
  295. my $text = shift;
  296. $paragraph->{'end_line_count'} = 0;
  297. my $result = '';
  298. my $protect_spaces_flag = $paragraph->{'protect_spaces'};
  299. my @segments = split
  300. /([^\S\x{202f}\x{00a0}]+)|(\p{InFullwidth})|((?:[^\s\p{InFullwidth}]|[\x{202f}\x{00a0}])+)/,
  301. $text;
  302. # Check now if a newline exists anywhere in the string to
  303. # try to eliminate regex checks later.
  304. my $newline_possible_flag = ($text =~ /\n/);
  305. my $debug_flag = $paragraph->{'DEBUG'};
  306. while (@segments) {
  307. # $empty_segment should be an empty string; the other variables
  308. # here were recognized as field separators by splice.
  309. my ($empty_segment, $spaces, $fullwidth_segment, $added_word)
  310. = splice (@segments, 0, 4);
  311. if ($debug_flag) {
  312. my $word = 'UNDEF';
  313. $word = $paragraph->{'word'} if (defined($paragraph->{'word'}));
  314. print STDERR "p ($paragraph->{'counter'}+$paragraph->{'word_counter'}) s `"._print_escaped_spaces($paragraph->{'space'})."', w `$word'\n";
  315. #print STDERR "TEXT: "._print_escaped_spaces($text)."|\n"
  316. }
  317. # \x{202f}\x{00a0} are non breaking spaces
  318. if (defined $spaces) {
  319. print STDERR "SPACES($paragraph->{'counter'}) `"._print_escaped_spaces($spaces)."'\n" if $debug_flag;
  320. if ($protect_spaces_flag) {
  321. $paragraph->{'word'} .= $spaces;
  322. $paragraph->{'last_char'} = substr($spaces, -1);
  323. $paragraph->{'word_counter'} += length($spaces);
  324. $paragraph->{'word'} =~ s/\n/ /g;
  325. # The $paragraph->{'counter'} != 0 is here to avoid having an
  326. # additional line output when the text is longer than the max.
  327. if ($paragraph->{'counter'} != 0 and
  328. $paragraph->{'counter'} + $paragraph->{'word_counter'} +
  329. length($paragraph->{'space'}) > $paragraph->{'max'}) {
  330. $result .= _cut_line($paragraph);
  331. }
  332. } else {
  333. my $pending_word = $paragraph->{'word'};
  334. $result .= _add_pending_word($paragraph);
  335. if ($paragraph->{'counter'} != 0 or $paragraph->{'unfilled'}
  336. or (defined $pending_word)) {
  337. if ($paragraph->{'end_sentence'}
  338. and $paragraph->{'end_sentence'} > 0
  339. and !$paragraph->{'frenchspacing'}
  340. and !$paragraph->{'unfilled'}) {
  341. if (length($paragraph->{'space'}) >= 1 or length($spaces) > 1) {
  342. # more than one space, we can make sure tht there are only
  343. # 2 spaces
  344. my $all_spaces = substr($paragraph->{'space'} . $spaces, 0, 2);
  345. $all_spaces =~ s/[\n\r]/ /g;
  346. $all_spaces .= ' ' x (2 - length($all_spaces));
  347. $paragraph->{'space'} = $all_spaces;
  348. } else {
  349. # if there is only one space, we let it accumulate
  350. my $new_space = $spaces;
  351. $new_space =~ s/^[\n\r]/ /;
  352. $paragraph->{'space'} = $new_space;
  353. }
  354. } else {
  355. # Only save the first space
  356. if ($paragraph->{'unfilled'}
  357. or length($paragraph->{'space'}) < 1) {
  358. if ($spaces =~ /\n/) {
  359. if (!$paragraph->{'unfilled'}) {
  360. $paragraph->{'space'} = ' ';
  361. } elsif ($spaces =~ /\n/) {
  362. $result .= _add_pending_word ($paragraph);
  363. $result .= _end_line ($paragraph);
  364. }
  365. } else {
  366. if (!$paragraph->{'unfilled'}) {
  367. $spaces =~ s/\r/ /g;
  368. $paragraph->{'space'} .= substr ($spaces, 0, 1);
  369. } else {
  370. $paragraph->{'space'} .= $spaces;
  371. }
  372. }
  373. }
  374. }
  375. }
  376. }
  377. #print STDERR "delete END_SENTENCE($paragraph->{'end_sentence'}): spaces\n"
  378. # if (defined($paragraph->{'end_sentence'}) and $paragraph->{'DEBUG'});
  379. #delete $paragraph->{'end_sentence'};
  380. if ($paragraph->{'counter'} + length($paragraph->{'space'})
  381. > $paragraph->{'max'}) {
  382. $result .= _cut_line($paragraph);
  383. }
  384. if ($newline_possible_flag and !$paragraph->{'unfilled'}
  385. and $paragraph->{'keep_end_lines'} and $spaces =~ /\n/) {
  386. $result .= _end_line($paragraph);
  387. }
  388. } elsif (defined $added_word) {
  389. my $tmp = $added_word;
  390. if (defined $paragraph->{'last_char'}) {
  391. # Use 'last_char' here because _add_next overwrites it.
  392. $tmp = $paragraph->{'last_char'} . $tmp;
  393. }
  394. $result .= _add_next($paragraph, $added_word, undef,
  395. !$newline_possible_flag);
  396. # Check if it is considered as an end of sentence. There are two things
  397. # to check: one, that we have a ., ! or ?; and second, that it is not
  398. # preceded by an upper-case letter (ignoring some punctuation)
  399. if (defined($paragraph->{'end_sentence'})
  400. and $added_word =~ /^[$after_punctuation_characters]*$/o) {
  401. # do nothing in the case of a continuation of after_punctuation_characters
  402. } elsif (!$paragraph->{'unfilled'}
  403. and $tmp =~
  404. /(^|[^[:upper:]$after_punctuation_characters$end_sentence_character])
  405. [$after_punctuation_characters]*[$end_sentence_character]
  406. [$end_sentence_character\x08$after_punctuation_characters]*$/x) {
  407. if ($paragraph->{'frenchspacing'}) {
  408. $paragraph->{'end_sentence'} = -1;
  409. } else {
  410. $paragraph->{'end_sentence'} = 1;
  411. }
  412. print STDERR "END_SENTENCE\n" if ($paragraph->{'DEBUG'});
  413. } else {
  414. delete $paragraph->{'end_sentence'};
  415. print STDERR "delete END_SENTENCE($paragraph->{'end_sentence'}): text\n"
  416. if (defined($paragraph->{'end_sentence'}) and $paragraph->{'DEBUG'});
  417. }
  418. } elsif (defined $fullwidth_segment) {
  419. print STDERR "EAST_ASIAN\n" if ($paragraph->{'DEBUG'});
  420. if (!defined($paragraph->{'word'})) {
  421. $paragraph->{'word'} = '';
  422. }
  423. $paragraph->{'word'} .= $fullwidth_segment;
  424. $paragraph->{'last_char'} = $fullwidth_segment;
  425. $paragraph->{'word_counter'} += 2;
  426. if ($paragraph->{'counter'} != 0 and
  427. $paragraph->{'counter'} + $paragraph->{'word_counter'}
  428. > $paragraph->{'max'}) {
  429. $result .= _cut_line($paragraph);
  430. }
  431. if (!$paragraph->{'protect_spaces'}
  432. and !$paragraph->{'double_width_no_break'}) {
  433. $result .= _add_pending_word($paragraph);
  434. $paragraph->{'space'} = '';
  435. }
  436. delete $paragraph->{'end_sentence'};
  437. }
  438. }
  439. return $result;
  440. }
  441. # for debug
  442. sub _print_escaped_spaces($)
  443. {
  444. my $spaces = shift;
  445. my $result = '';
  446. foreach my $pos (0 .. length($spaces)-1) {
  447. my $char = substr($spaces, $pos, 1);
  448. if ($char eq ' ') {
  449. $result .= $char;
  450. } elsif ($char =~ /[\f\n]/) {
  451. $char =~ s/\f/\\f/;
  452. $char =~ s/\n/\\n/;
  453. $result .= $char;
  454. } elsif ($char =~ /\s/) {
  455. if (ord($char) <= hex(0xFFFF)) {
  456. $result .= '\x'.sprintf("%04x",ord($char));
  457. } else {
  458. $result .= '\x'.sprintf("%06x",ord($char));
  459. }
  460. } else {
  461. $result .= $char;
  462. }
  463. }
  464. return $result;
  465. }
  466. 1;