coreutils-8.31-i18n-1.patch 167 KB


  1. Submitted by: DJ Lucas (dj_AT_linuxfromscratch_DOT_org)
  2. Date: 2017-03-12
  3. Initial Package Version: 8.27
  4. Upstream Status: Rejected
  5. Origin: Based on Fedora's i18n patches at
  6. http://pkgs.fedoraproject.org/cgit/rpms/coreutils.git/tree/
  7. Description: Fixes i18n issues with various Coreutils programs
  8. diff -Naurp coreutils-8.27-orig/bootstrap.conf coreutils-8.27/bootstrap.conf
  9. --- coreutils-8.27-orig/bootstrap.conf 2017-03-07 23:34:06.000000000 -0600
  10. +++ coreutils-8.27/bootstrap.conf 2017-03-11 23:47:38.068058445 -0600
  11. @@ -152,6 +152,7 @@ gnulib_modules="
  12. maintainer-makefile
  13. malloc-gnu
  14. manywarnings
  15. + mbfile
  16. mbrlen
  17. mbrtowc
  18. mbsalign
  19. diff -Naurp coreutils-8.27-orig/configure.ac coreutils-8.27/configure.ac
  20. --- coreutils-8.27-orig/configure.ac 2017-02-26 08:52:29.000000000 -0600
  21. +++ coreutils-8.27/configure.ac 2017-03-11 23:47:38.068058445 -0600
  22. @@ -429,6 +429,8 @@ fi
  23. # I'm leaving it here for now. This whole thing needs to be modernized...
  24. gl_WINSIZE_IN_PTEM
  25. +gl_MBFILE
  26. +
  27. gl_HEADER_TIOCGWINSZ_IN_TERMIOS_H
  28. if test $gl_cv_sys_tiocgwinsz_needs_termios_h = no && \
  29. diff -Naurp coreutils-8.27-orig/lib/linebuffer.h coreutils-8.27/lib/linebuffer.h
  30. --- coreutils-8.27-orig/lib/linebuffer.h 2017-01-01 16:35:38.000000000 -0600
  31. +++ coreutils-8.27/lib/linebuffer.h 2017-03-11 23:47:13.089286391 -0600
  32. @@ -21,6 +21,11 @@
  33. # include <stdio.h>
  34. +/* Get mbstate_t. */
  35. +# if HAVE_WCHAR_H
  36. +# include <wchar.h>
  37. +# endif
  38. +
  39. /* A 'struct linebuffer' holds a line of text. */
  40. struct linebuffer
  41. @@ -28,6 +33,9 @@ struct linebuffer
  42. size_t size; /* Allocated. */
  43. size_t length; /* Used. */
  44. char *buffer;
  45. +# if HAVE_WCHAR_H
  46. + mbstate_t state;
  47. +# endif
  48. };
  49. /* Initialize linebuffer LINEBUFFER for use. */
  50. diff -Naurp coreutils-8.27-orig/lib/mbfile.c coreutils-8.27/lib/mbfile.c
  51. --- coreutils-8.27-orig/lib/mbfile.c 1969-12-31 18:00:00.000000000 -0600
  52. +++ coreutils-8.27/lib/mbfile.c 2017-03-11 23:47:38.069058397 -0600
  53. @@ -0,0 +1,3 @@
  54. +#include <config.h>
  55. +#define MBFILE_INLINE _GL_EXTERN_INLINE
  56. +#include "mbfile.h"
  57. diff -Naurp coreutils-8.27-orig/lib/mbfile.h coreutils-8.27/lib/mbfile.h
  58. --- coreutils-8.27-orig/lib/mbfile.h 1969-12-31 18:00:00.000000000 -0600
  59. +++ coreutils-8.27/lib/mbfile.h 2017-03-11 23:47:38.069058397 -0600
  60. @@ -0,0 +1,255 @@
  61. +/* Multibyte character I/O: macros for multi-byte encodings.
  62. + Copyright (C) 2001, 2005, 2009-2017 Free Software Foundation, Inc.
  63. +
  64. + This program is free software: you can redistribute it and/or modify
  65. + it under the terms of the GNU General Public License as published by
  66. + the Free Software Foundation; either version 3 of the License, or
  67. + (at your option) any later version.
  68. +
  69. + This program is distributed in the hope that it will be useful,
  70. + but WITHOUT ANY WARRANTY; without even the implied warranty of
  71. + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  72. + GNU General Public License for more details.
  73. +
  74. + You should have received a copy of the GNU General Public License
  75. + along with this program. If not, see <http://www.gnu.org/licenses/>. */
  76. +
  77. +/* Written by Mitsuru Chinen <mchinen@yamato.ibm.com>
  78. + and Bruno Haible <bruno@clisp.org>. */
  79. +
  80. +/* The macros in this file implement multi-byte character input from a
  81. + stream.
  82. +
  83. + mb_file_t
  84. + is the type for multibyte character input stream, usable for variable
  85. + declarations.
  86. +
  87. + mbf_char_t
  88. + is the type for multibyte character or EOF, usable for variable
  89. + declarations.
  90. +
  91. + mbf_init (mbf, stream)
  92. + initializes the MB_FILE for reading from stream.
  93. +
  94. + mbf_getc (mbc, mbf)
  95. + reads the next multibyte character from mbf and stores it in mbc.
  96. +
  97. + mb_iseof (mbc)
  98. + returns true if mbc represents the EOF value.
  99. +
  100. + Here are the function prototypes of the macros.
  101. +
  102. + extern void mbf_init (mb_file_t mbf, FILE *stream);
  103. + extern void mbf_getc (mbf_char_t mbc, mb_file_t mbf);
  104. + extern bool mb_iseof (const mbf_char_t mbc);
  105. + */
  106. +
  107. +#ifndef _MBFILE_H
  108. +#define _MBFILE_H 1
  109. +
  110. +#include <assert.h>
  111. +#include <stdbool.h>
  112. +#include <stdio.h>
  113. +#include <string.h>
  114. +
  115. +/* Tru64 with Desktop Toolkit C has a bug: <stdio.h> must be included before
  116. + <wchar.h>.
  117. + BSD/OS 4.1 has a bug: <stdio.h> and <time.h> must be included before
  118. + <wchar.h>. */
  119. +#include <stdio.h>
  120. +#include <time.h>
  121. +#include <wchar.h>
  122. +
  123. +#include "mbchar.h"
  124. +
  125. +#ifndef _GL_INLINE_HEADER_BEGIN
  126. + #error "Please include config.h first."
  127. +#endif
  128. +_GL_INLINE_HEADER_BEGIN
  129. +#ifndef MBFILE_INLINE
  130. +# define MBFILE_INLINE _GL_INLINE
  131. +#endif
  132. +
  133. +struct mbfile_multi {
  134. + FILE *fp;
  135. + bool eof_seen;
  136. + bool have_pushback;
  137. + mbstate_t state;
  138. + unsigned int bufcount;
  139. + char buf[MBCHAR_BUF_SIZE];
  140. + struct mbchar pushback;
  141. +};
  142. +
  143. +MBFILE_INLINE void
  144. +mbfile_multi_getc (struct mbchar *mbc, struct mbfile_multi *mbf)
  145. +{
  146. + size_t bytes;
  147. +
  148. + /* If EOF has already been seen, don't use getc. This matters if
  149. + mbf->fp is connected to an interactive tty. */
  150. + if (mbf->eof_seen)
  151. + goto eof;
  152. +
  153. + /* Return character pushed back, if there is one. */
  154. + if (mbf->have_pushback)
  155. + {
  156. + mb_copy (mbc, &mbf->pushback);
  157. + mbf->have_pushback = false;
  158. + return;
  159. + }
  160. +
  161. + /* Before using mbrtowc, we need at least one byte. */
  162. + if (mbf->bufcount == 0)
  163. + {
  164. + int c = getc (mbf->fp);
  165. + if (c == EOF)
  166. + {
  167. + mbf->eof_seen = true;
  168. + goto eof;
  169. + }
  170. + mbf->buf[0] = (unsigned char) c;
  171. + mbf->bufcount++;
  172. + }
  173. +
  174. + /* Handle most ASCII characters quickly, without calling mbrtowc(). */
  175. + if (mbf->bufcount == 1 && mbsinit (&mbf->state) && is_basic (mbf->buf[0]))
  176. + {
  177. + /* These characters are part of the basic character set. ISO C 99
  178. + guarantees that their wide character code is identical to their
  179. + char code. */
  180. + mbc->wc = mbc->buf[0] = mbf->buf[0];
  181. + mbc->wc_valid = true;
  182. + mbc->ptr = &mbc->buf[0];
  183. + mbc->bytes = 1;
  184. + mbf->bufcount = 0;
  185. + return;
  186. + }
  187. +
  188. + /* Use mbrtowc on an increasing number of bytes. Read only as many bytes
  189. + from mbf->fp as needed. This is needed to give reasonable interactive
  190. + behaviour when mbf->fp is connected to an interactive tty. */
  191. + for (;;)
  192. + {
  193. + /* We don't know whether the 'mbrtowc' function updates the state when
  194. + it returns -2, - this is the ISO C 99 and glibc-2.2 behaviour - or
  195. + not - amended ANSI C, glibc-2.1 and Solaris 2.7 behaviour. We
  196. + don't have an autoconf test for this, yet.
  197. + The new behaviour would allow us to feed the bytes one by one into
  198. + mbrtowc. But the old behaviour forces us to feed all bytes since
  199. + the end of the last character into mbrtowc. Since we want to retry
  200. + with more bytes when mbrtowc returns -2, we must backup the state
  201. + before calling mbrtowc, because implementations with the new
  202. + behaviour will clobber it. */
  203. + mbstate_t backup_state = mbf->state;
  204. +
  205. + bytes = mbrtowc (&mbc->wc, &mbf->buf[0], mbf->bufcount, &mbf->state);
  206. +
  207. + if (bytes == (size_t) -1)
  208. + {
  209. + /* An invalid multibyte sequence was encountered. */
  210. + /* Return a single byte. */
  211. + bytes = 1;
  212. + mbc->wc_valid = false;
  213. + break;
  214. + }
  215. + else if (bytes == (size_t) -2)
  216. + {
  217. + /* An incomplete multibyte character. */
  218. + mbf->state = backup_state;
  219. + if (mbf->bufcount == MBCHAR_BUF_SIZE)
  220. + {
  221. + /* An overlong incomplete multibyte sequence was encountered. */
  222. + /* Return a single byte. */
  223. + bytes = 1;
  224. + mbc->wc_valid = false;
  225. + break;
  226. + }
  227. + else
  228. + {
  229. + /* Read one more byte and retry mbrtowc. */
  230. + int c = getc (mbf->fp);
  231. + if (c == EOF)
  232. + {
  233. + /* An incomplete multibyte character at the end. */
  234. + mbf->eof_seen = true;
  235. + bytes = mbf->bufcount;
  236. + mbc->wc_valid = false;
  237. + break;
  238. + }
  239. + mbf->buf[mbf->bufcount] = (unsigned char) c;
  240. + mbf->bufcount++;
  241. + }
  242. + }
  243. + else
  244. + {
  245. + if (bytes == 0)
  246. + {
  247. + /* A null wide character was encountered. */
  248. + bytes = 1;
  249. + assert (mbf->buf[0] == '\0');
  250. + assert (mbc->wc == 0);
  251. + }
  252. + mbc->wc_valid = true;
  253. + break;
  254. + }
  255. + }
  256. +
  257. + /* Return the multibyte sequence mbf->buf[0..bytes-1]. */
  258. + mbc->ptr = &mbc->buf[0];
  259. + memcpy (&mbc->buf[0], &mbf->buf[0], bytes);
  260. + mbc->bytes = bytes;
  261. +
  262. + mbf->bufcount -= bytes;
  263. + if (mbf->bufcount > 0)
  264. + {
  265. + /* It's not worth calling memmove() for so few bytes. */
  266. + unsigned int count = mbf->bufcount;
  267. + char *p = &mbf->buf[0];
  268. +
  269. + do
  270. + {
  271. + *p = *(p + bytes);
  272. + p++;
  273. + }
  274. + while (--count > 0);
  275. + }
  276. + return;
  277. +
  278. +eof:
  279. + /* An mbchar_t with bytes == 0 is used to indicate EOF. */
  280. + mbc->ptr = NULL;
  281. + mbc->bytes = 0;
  282. + mbc->wc_valid = false;
  283. + return;
  284. +}
  285. +
  286. +MBFILE_INLINE void
  287. +mbfile_multi_ungetc (const struct mbchar *mbc, struct mbfile_multi *mbf)
  288. +{
  289. + mb_copy (&mbf->pushback, mbc);
  290. + mbf->have_pushback = true;
  291. +}
  292. +
  293. +typedef struct mbfile_multi mb_file_t;
  294. +
  295. +typedef mbchar_t mbf_char_t;
  296. +
  297. +#define mbf_init(mbf, stream) \
  298. + ((mbf).fp = (stream), \
  299. + (mbf).eof_seen = false, \
  300. + (mbf).have_pushback = false, \
  301. + memset (&(mbf).state, '\0', sizeof (mbstate_t)), \
  302. + (mbf).bufcount = 0)
  303. +
  304. +#define mbf_getc(mbc, mbf) mbfile_multi_getc (&(mbc), &(mbf))
  305. +
  306. +#define mbf_ungetc(mbc, mbf) mbfile_multi_ungetc (&(mbc), &(mbf))
  307. +
  308. +#define mb_iseof(mbc) ((mbc).bytes == 0)
  309. +
  310. +#ifndef _GL_INLINE_HEADER_END
  311. + #error "Please include config.h first."
  312. +#endif
  313. +_GL_INLINE_HEADER_END /* Closes the region opened by _GL_INLINE_HEADER_BEGIN near the top of this header. */
  314. +
  315. +#endif /* _MBFILE_H */
  316. diff -Naurp coreutils-8.27-orig/m4/mbfile.m4 coreutils-8.27/m4/mbfile.m4
  317. --- coreutils-8.27-orig/m4/mbfile.m4 1969-12-31 18:00:00.000000000 -0600
  318. +++ coreutils-8.27/m4/mbfile.m4 2017-03-11 23:47:38.070058349 -0600
  319. @@ -0,0 +1,14 @@
  320. +# mbfile.m4 serial 7
  321. +dnl Copyright (C) 2005, 2008-2017 Free Software Foundation, Inc.
  322. +dnl This file is free software; the Free Software Foundation
  323. +dnl gives unlimited permission to copy and/or distribute it,
  324. +dnl with or without modifications, as long as this notice is preserved.
  325. +
  326. +dnl autoconf tests required for use of mbfile.h
  327. +dnl From Bruno Haible.
  328. +
  329. +AC_DEFUN([gl_MBFILE],
  330. +[
  331. + AC_REQUIRE([AC_TYPE_MBSTATE_T])
  332. + :
  333. +])
  334. diff -Naurp coreutils-8.27-orig/src/cut.c coreutils-8.27/src/cut.c
  335. --- coreutils-8.27-orig/src/cut.c 2017-01-01 16:34:24.000000000 -0600
  336. +++ coreutils-8.27/src/cut.c 2017-03-11 23:47:59.526048471 -0600
  337. @@ -28,6 +28,11 @@
  338. #include <assert.h>
  339. #include <getopt.h>
  340. #include <sys/types.h>
  341. +
  342. +/* Get mbstate_t, mbrtowc(). */
  343. +#if HAVE_WCHAR_H
  344. +# include <wchar.h>
  345. +#endif
  346. #include "system.h"
  347. #include "error.h"
  348. @@ -38,6 +43,18 @@
  349. #include "set-fields.h"
  350. +/* MB_LEN_MAX is incorrectly defined to be 1 in at least one GCC
  351. + installation; work around this configuration error. */
  352. +#if !defined MB_LEN_MAX || MB_LEN_MAX < 2
  353. +# undef MB_LEN_MAX
  354. +# define MB_LEN_MAX 16
  355. +#endif
  356. +
  357. +/* Some systems, like BeOS, have multibyte encodings but lack mbstate_t. */
  358. +#if HAVE_MBRTOWC && defined mbstate_t
  359. +# define mbrtowc(pwc, s, n, ps) (mbrtowc) (pwc, s, n, 0)
  360. +#endif
  361. +
  362. /* The official name of this program (e.g., no 'g' prefix). */
  363. #define PROGRAM_NAME "cut"
  364. @@ -54,6 +71,52 @@
  365. } \
  366. while (0)
  367. +/* Refill the buffer BUF to get a multibyte character. */
  368. +#define REFILL_BUFFER(BUF, BUFPOS, BUFLEN, STREAM) \
  369. + do \
  370. + { \
  371. + if (BUFLEN < MB_LEN_MAX && !feof (STREAM) && !ferror (STREAM)) \
  372. + { \
  373. + memmove (BUF, BUFPOS, BUFLEN); \
  374. + BUFLEN += fread (BUF + BUFLEN, sizeof(char), BUFSIZ, STREAM); \
  375. + BUFPOS = BUF; \
  376. + } \
  377. + } \
  378. + while (0)
  379. +
  380. +/* Decode the wide character at BUFPOS into WC; BUFPOS itself is not advanced. WC is WEOF if BUFLEN < 1.
  381. + On an invalid or incomplete sequence CONVFAIL is true, STATE is restored and MBLENGTH is 1; else CONVFAIL is false. */
  382. +#define GET_NEXT_WC_FROM_BUFFER(WC, BUFPOS, BUFLEN, MBLENGTH, STATE, CONVFAIL) \
  383. + do \
  384. + { \
  385. + mbstate_t state_bak; \
  386. + \
  387. + if (BUFLEN < 1) \
  388. + { \
  389. + WC = WEOF; \
  390. + break; \
  391. + } \
  392. + \
  393. + /* Get a wide character. */ \
  394. + CONVFAIL = false; \
  395. + state_bak = STATE; \
  396. + MBLENGTH = mbrtowc ((wchar_t *)&WC, BUFPOS, BUFLEN, &STATE); \
  397. + \
  398. + switch (MBLENGTH) \
  399. + { \
  400. + case (size_t)-1: \
  401. + case (size_t)-2: \
  402. + CONVFAIL = true; \
  403. + STATE = state_bak; \
  404. + /* Fall through. */ \
  405. + \
  406. + case 0: \
  407. + MBLENGTH = 1; \
  408. + break; \
  409. + } \
  410. + } \
  411. + while (0)
  412. +
  413. /* Pointer inside RP. When checking if a byte or field is selected
  414. by a finite range, we check if it is between CURRENT_RP.LO
  415. @@ -61,6 +124,9 @@
  416. CURRENT_RP.HI then we make CURRENT_RP to point to the next range pair. */
  417. static struct field_range_pair *current_rp;
  418. +/* Length of the delimiter given as argument to -d. */
  419. +size_t delimlen;
  420. +
  421. /* This buffer is used to support the semantics of the -s option
  422. (or lack of same) when the specified field list includes (does
  423. not include) the first field. In both of those cases, the entire
  424. @@ -77,15 +143,25 @@ enum operating_mode
  425. {
  426. undefined_mode,
  427. - /* Output characters that are in the given bytes. */
  428. + /* Output bytes that are at the given positions. */
  429. byte_mode,
  430. + /* Output characters that are at the given positions. */
  431. + character_mode,
  432. +
  433. /* Output the given delimiter-separated fields. */
  434. field_mode
  435. };
  436. static enum operating_mode operating_mode;
  437. +/* If nonzero, when in byte mode, don't split multibyte characters. */
  438. +static int byte_mode_character_aware;
  439. +
  440. +/* If nonzero, use the single-byte-locale functions even
  441. + if this program runs in a multibyte locale. */
  442. +static int force_singlebyte_mode;
  443. +
  444. /* If true do not output lines containing no delimiter characters.
  445. Otherwise, all such lines are printed. This option is valid only
  446. with field mode. */
  447. @@ -97,6 +173,9 @@ static bool complement;
  448. /* The delimiter character for field mode. */
  449. static unsigned char delim;
  450. +#if HAVE_WCHAR_H
  451. +static wchar_t wcdelim;
  452. +#endif
  453. /* The delimiter for each line/record. */
  454. static unsigned char line_delim = '\n';
  455. @@ -164,7 +243,7 @@ Print selected parts of lines from each
  456. -f, --fields=LIST select only these fields; also print any line\n\
  457. that contains no delimiter character, unless\n\
  458. the -s option is specified\n\
  459. - -n (ignored)\n\
  460. + -n with -b: don't split multibyte characters\n\
  461. "), stdout);
  462. fputs (_("\
  463. --complement complement the set of selected bytes, characters\n\
  464. @@ -280,6 +359,82 @@ cut_bytes (FILE *stream)
  465. }
  466. }
  467. +#if HAVE_MBRTOWC
  468. +/* This function is used in the following two cases:
  469. +
  470. + 1. Read from the stream STREAM, printing to standard output any selected
  471. + characters.
  472. +
  473. + 2. Read from stream STREAM, printing to standard output any selected bytes,
  474. + without splitting multibyte characters. */
  475. +
  476. +static void
  477. +cut_characters_or_cut_bytes_no_split (FILE *stream)
  478. +{
  479. + size_t idx; /* number of bytes or characters in the line so far. */
  480. + char buf[MB_LEN_MAX + BUFSIZ]; /* For spooling a read byte sequence. */
  481. + char *bufpos; /* Next read position of BUF. */
  482. + size_t buflen; /* The length of the byte sequence in buf. */
  483. + wint_t wc; /* A gotten wide character. */
  484. + size_t mblength; /* The number of bytes in the multibyte sequence
  485. + that was converted to WC. */
  486. + mbstate_t state; /* State of the stream. */
  487. + bool convfail = false; /* true, when conversion failed. Otherwise false. */
  488. + /* Whether to begin printing delimiters between ranges for the current line.
  489. + Set after we've begun printing data corresponding to the first range. */
  490. + bool print_delimiter = false;
  491. +
  492. + idx = 0;
  493. + buflen = 0;
  494. + bufpos = buf;
  495. + memset (&state, '\0', sizeof(mbstate_t));
  496. +
  497. + current_rp = frp;
  498. +
  499. + while (1)
  500. + {
  501. + REFILL_BUFFER (buf, bufpos, buflen, stream);
  502. +
  503. + GET_NEXT_WC_FROM_BUFFER (wc, bufpos, buflen, mblength, state, convfail);
  504. + (void) convfail; /* ignore unused */
  505. +
  506. + if (wc == WEOF)
  507. + {
  508. + if (idx > 0)
  509. + putchar (line_delim);
  510. + break;
  511. + }
  512. + else if (wc == line_delim)
  513. + {
  514. + putchar (line_delim);
  515. + idx = 0;
  516. + print_delimiter = false;
  517. + current_rp = frp;
  518. + }
  519. + else
  520. + {
  521. + next_item (&idx);
  522. + if (print_kth (idx))
  523. + {
  524. + if (output_delimiter_specified)
  525. + {
  526. + if (print_delimiter && is_range_start_index (idx))
  527. + {
  528. + fwrite (output_delimiter_string, sizeof (char),
  529. + output_delimiter_length, stdout);
  530. + }
  531. + print_delimiter = true;
  532. + }
  533. + fwrite (bufpos, mblength, sizeof(char), stdout);
  534. + }
  535. + }
  536. +
  537. + buflen -= mblength;
  538. + bufpos += mblength;
  539. + }
  540. +}
  541. +#endif
  542. +
  543. /* Read from stream STREAM, printing to standard output any selected fields. */
  544. static void
  545. @@ -425,13 +580,211 @@ cut_fields (FILE *stream)
  546. }
  547. }
  548. +#if HAVE_MBRTOWC
  549. +static void
  550. +cut_fields_mb (FILE *stream)
  551. +{
  552. + int c;
  553. + size_t field_idx;
  554. + int found_any_selected_field;
  555. + int buffer_first_field;
  556. + int empty_input;
  557. + char buf[MB_LEN_MAX + BUFSIZ]; /* For spooling a read byte sequence. */
  558. + char *bufpos; /* Next read position of BUF. */
  559. + size_t buflen; /* The length of the byte sequence in buf. */
  560. + wint_t wc = 0; /* A gotten wide character. */
  561. + size_t mblength; /* The number of bytes in the multibyte sequence
  562. + that was converted to WC. */
  563. + mbstate_t state; /* State of the stream. */
  564. + bool convfail = false; /* true, when conversion failed. Otherwise false. */
  565. +
  566. + current_rp = frp;
  567. +
  568. + found_any_selected_field = 0;
  569. + field_idx = 1;
  570. + bufpos = buf;
  571. + buflen = 0;
  572. + memset (&state, '\0', sizeof(mbstate_t));
  573. +
  574. + c = getc (stream);
  575. + empty_input = (c == EOF);
  576. + if (c != EOF)
  577. + {
  578. + ungetc (c, stream);
  579. + wc = 0;
  580. + }
  581. + else
  582. + wc = WEOF;
  583. +
  584. + /* To support the semantics of the -s flag, we may have to buffer
  585. + all of the first field to determine whether it is `delimited.'
  586. + But that is unnecessary if all non-delimited lines must be printed
  587. + and the first field has been selected, or if non-delimited lines
  588. + must be suppressed and the first field has *not* been selected.
  589. + That is because a non-delimited line has exactly one field. */
  590. + buffer_first_field = (suppress_non_delimited ^ !print_kth (1));
  591. +
  592. + while (1)
  593. + {
  594. + if (field_idx == 1 && buffer_first_field)
  595. + {
  596. + int len = 0;
  597. +
  598. + while (1)
  599. + {
  600. + REFILL_BUFFER (buf, bufpos, buflen, stream);
  601. +
  602. + GET_NEXT_WC_FROM_BUFFER
  603. + (wc, bufpos, buflen, mblength, state, convfail);
  604. +
  605. + if (wc == WEOF)
  606. + break;
  607. +
  608. + field_1_buffer = xrealloc (field_1_buffer, len + mblength);
  609. + memcpy (field_1_buffer + len, bufpos, mblength);
  610. + len += mblength;
  611. + buflen -= mblength;
  612. + bufpos += mblength;
  613. +
  614. + if (!convfail && (wc == line_delim || wc == wcdelim))
  615. + break;
  616. + }
  617. +
  618. + if (len <= 0 && wc == WEOF)
  619. + break;
  620. +
  621. + /* If the first field extends to the end of line (it is not
  622. + delimited) and we are printing all non-delimited lines,
  623. + print this one. */
  624. + if (convfail || (!convfail && wc != wcdelim))
  625. + {
  626. + if (suppress_non_delimited)
  627. + {
  628. + /* Empty. */
  629. + }
  630. + else
  631. + {
  632. + fwrite (field_1_buffer, sizeof (char), len, stdout);
  633. + /* Make sure the output line is newline terminated. */
  634. + if (convfail || (!convfail && wc != line_delim))
  635. + putchar (line_delim);
  636. + }
  637. + continue;
  638. + }
  639. +
  640. + if (print_kth (1))
  641. + {
  642. + /* Print the field, but not the trailing delimiter. */
  643. + fwrite (field_1_buffer, sizeof (char), len - 1, stdout);
  644. + found_any_selected_field = 1;
  645. + }
  646. + next_item (&field_idx);
  647. + }
  648. +
  649. + if (wc != WEOF)
  650. + {
  651. + if (print_kth (field_idx))
  652. + {
  653. + if (found_any_selected_field)
  654. + {
  655. + fwrite (output_delimiter_string, sizeof (char),
  656. + output_delimiter_length, stdout);
  657. + }
  658. + found_any_selected_field = 1;
  659. + }
  660. +
  661. + while (1)
  662. + {
  663. + REFILL_BUFFER (buf, bufpos, buflen, stream);
  664. +
  665. + GET_NEXT_WC_FROM_BUFFER
  666. + (wc, bufpos, buflen, mblength, state, convfail);
  667. +
  668. + if (wc == WEOF)
  669. + break;
  670. + else if (!convfail && (wc == wcdelim || wc == line_delim))
  671. + {
  672. + buflen -= mblength;
  673. + bufpos += mblength;
  674. + break;
  675. + }
  676. +
  677. + if (print_kth (field_idx))
  678. + fwrite (bufpos, mblength, sizeof(char), stdout);
  679. +
  680. + buflen -= mblength;
  681. + bufpos += mblength;
  682. + }
  683. + }
  684. +
  685. + if ((!convfail || wc == line_delim) && buflen < 1)
  686. + wc = WEOF;
  687. +
  688. + if (!convfail && wc == wcdelim)
  689. + next_item (&field_idx);
  690. + else if (wc == WEOF || (!convfail && wc == line_delim))
  691. + {
  692. + if (found_any_selected_field
  693. + || (!empty_input && !(suppress_non_delimited && field_idx == 1)))
  694. + putchar (line_delim);
  695. + if (wc == WEOF)
  696. + break;
  697. + field_idx = 1;
  698. + current_rp = frp;
  699. + found_any_selected_field = 0;
  700. + }
  701. + }
  702. +}
  703. +#endif
  704. +
  705. static void
  706. cut_stream (FILE *stream)
  707. {
  708. - if (operating_mode == byte_mode)
  709. - cut_bytes (stream);
  710. +#if HAVE_MBRTOWC
  711. + if (MB_CUR_MAX > 1 && !force_singlebyte_mode)
  712. + {
  713. + switch (operating_mode)
  714. + {
  715. + case byte_mode:
  716. + if (byte_mode_character_aware)
  717. + cut_characters_or_cut_bytes_no_split (stream);
  718. + else
  719. + cut_bytes (stream);
  720. + break;
  721. +
  722. + case character_mode:
  723. + cut_characters_or_cut_bytes_no_split (stream);
  724. + break;
  725. +
  726. + case field_mode:
  727. + if (delimlen == 1)
  728. + {
  729. + /* Check if we have utf8 multibyte locale, so we can use this
  730. + optimization because of uniqueness of characters, which is
  731. + not true for e.g. SJIS */
  732. + char * loc = setlocale(LC_CTYPE, NULL);
  733. + if (loc && (strstr (loc, "UTF-8") || strstr (loc, "utf-8") ||
  734. + strstr (loc, "UTF8") || strstr (loc, "utf8")))
  735. + {
  736. + cut_fields (stream);
  737. + break;
  738. + }
  739. + }
  740. + cut_fields_mb (stream);
  741. + break;
  742. +
  743. + default:
  744. + abort ();
  745. + }
  746. + }
  747. else
  748. - cut_fields (stream);
  749. +#endif
  750. + {
  751. + if (operating_mode == field_mode)
  752. + cut_fields (stream);
  753. + else
  754. + cut_bytes (stream);
  755. + }
  756. }
  757. /* Process file FILE to standard output.
  758. @@ -483,6 +836,7 @@ main (int argc, char **argv)
  759. bool ok;
  760. bool delim_specified = false;
  761. char *spec_list_string IF_LINT ( = NULL);
  762. + char mbdelim[MB_LEN_MAX + 1];
  763. initialize_main (&argc, &argv);
  764. set_program_name (argv[0]);
  765. @@ -505,7 +859,6 @@ main (int argc, char **argv)
  766. switch (optc)
  767. {
  768. case 'b':
  769. - case 'c':
  770. /* Build the byte list. */
  771. if (operating_mode != undefined_mode)
  772. FATAL_ERROR (_("only one type of list may be specified"));
  773. @@ -513,6 +866,14 @@ main (int argc, char **argv)
  774. spec_list_string = optarg;
  775. break;
  776. + case 'c':
  777. + /* Build the character list. */
  778. + if (operating_mode != undefined_mode)
  779. + FATAL_ERROR (_("only one type of list may be specified"));
  780. + operating_mode = character_mode;
  781. + spec_list_string = optarg;
  782. + break;
  783. +
  784. case 'f':
  785. /* Build the field list. */
  786. if (operating_mode != undefined_mode)
  787. @@ -524,10 +885,38 @@ main (int argc, char **argv)
  788. case 'd':
  789. /* New delimiter. */
  790. /* Interpret -d '' to mean 'use the NUL byte as the delimiter.' */
  791. - if (optarg[0] != '\0' && optarg[1] != '\0')
  792. - FATAL_ERROR (_("the delimiter must be a single character"));
  793. - delim = optarg[0];
  794. - delim_specified = true;
  795. + {
  796. +#if HAVE_MBRTOWC
  797. + if(MB_CUR_MAX > 1)
  798. + {
  799. + mbstate_t state;
  800. +
  801. + memset (&state, '\0', sizeof(mbstate_t));
  802. + delimlen = mbrtowc (&wcdelim, optarg, strnlen(optarg, MB_LEN_MAX), &state);
  803. +
  804. + if (delimlen == (size_t)-1 || delimlen == (size_t)-2)
  805. + ++force_singlebyte_mode;
  806. + else
  807. + {
  808. + delimlen = (delimlen < 1) ? 1 : delimlen;
  809. + if (wcdelim != L'\0' && *(optarg + delimlen) != '\0')
  810. + FATAL_ERROR (_("the delimiter must be a single character"));
  811. + memcpy (mbdelim, optarg, delimlen);
  812. + mbdelim[delimlen] = '\0';
  813. + if (delimlen == 1)
  814. + delim = *optarg;
  815. + }
  816. + }
  817. +
  818. + if (MB_CUR_MAX <= 1 || force_singlebyte_mode)
  819. +#endif
  820. + {
  821. + if (optarg[0] != '\0' && optarg[1] != '\0')
  822. + FATAL_ERROR (_("the delimiter must be a single character"));
  823. + delim = (unsigned char) optarg[0];
  824. + }
  825. + delim_specified = true;
  826. + }
  827. break;
  828. case OUTPUT_DELIMITER_OPTION:
  829. @@ -540,6 +929,7 @@ main (int argc, char **argv)
  830. break;
  831. case 'n':
  832. + byte_mode_character_aware = 1;
  833. break;
  834. case 's':
  835. @@ -579,15 +969,34 @@ main (int argc, char **argv)
  836. | (complement ? SETFLD_COMPLEMENT : 0) );
  837. if (!delim_specified)
  838. - delim = '\t';
  839. + {
  840. + delim = '\t';
  841. +#ifdef HAVE_MBRTOWC
  842. + wcdelim = L'\t';
  843. + mbdelim[0] = '\t';
  844. + mbdelim[1] = '\0';
  845. + delimlen = 1;
  846. +#endif
  847. + }
  848. if (output_delimiter_string == NULL)
  849. {
  850. - static char dummy[2];
  851. - dummy[0] = delim;
  852. - dummy[1] = '\0';
  853. - output_delimiter_string = dummy;
  854. - output_delimiter_length = 1;
  855. +#ifdef HAVE_MBRTOWC
  856. + if (MB_CUR_MAX > 1 && !force_singlebyte_mode)
  857. + {
  858. + output_delimiter_string = xstrdup(mbdelim);
  859. + output_delimiter_length = delimlen;
  860. + }
  861. +
  862. + if (MB_CUR_MAX <= 1 || force_singlebyte_mode)
  863. +#endif
  864. + {
  865. + static char dummy[2];
  866. + dummy[0] = delim;
  867. + dummy[1] = '\0';
  868. + output_delimiter_string = dummy;
  869. + output_delimiter_length = 1;
  870. + }
  871. }
  872. if (optind == argc)
  873. diff -Naurp coreutils-8.27-orig/src/expand.c coreutils-8.27/src/expand.c
  874. --- coreutils-8.27-orig/src/expand.c 2017-02-26 15:42:25.000000000 -0600
  875. +++ coreutils-8.27/src/expand.c 2017-03-11 23:49:06.758133530 -0600
  876. @@ -37,6 +37,9 @@
  877. #include <stdio.h>
  878. #include <getopt.h>
  879. #include <sys/types.h>
  880. +
  881. +#include <mbfile.h>
  882. +
  883. #include "system.h"
  884. #include "die.h"
  885. #include "xstrndup.h"
  886. @@ -100,19 +103,41 @@ expand (void)
  887. {
  888. /* Input stream. */
  889. FILE *fp = next_file (NULL);
  890. + mb_file_t mbf;
  891. + mbf_char_t c;
  892. + /* True if the starting locale is utf8. */
  893. + bool using_utf_locale;
  894. +
  895. + /* True if the first file contains BOM header. */
  896. + bool found_bom;
  897. + using_utf_locale=check_utf_locale();
  898. if (!fp)
  899. return;
  900. + mbf_init (mbf, fp);
  901. + found_bom=check_bom(fp,&mbf);
  902. - while (true)
  903. + if (using_utf_locale == false && found_bom == true)
  904. + {
  905. + /*try using some predefined locale */
  906. +
  907. + if (set_utf_locale () != 0)
  908. {
  909. - /* Input character, or EOF. */
  910. - int c;
  911. + error (EXIT_FAILURE, errno, _("cannot set UTF-8 locale"));
  912. + }
  913. + }
  914. +
  915. + if (found_bom == true)
  916. + {
  917. + print_bom();
  918. + }
  919. +
  920. + while (true)
  921. + {
  922. /* If true, perform translations. */
  923. bool convert = true;
  924. -
  925. /* The following variables have valid values only when CONVERT
  926. is true: */
  927. @@ -122,17 +147,48 @@ expand (void)
  928. /* Index in TAB_LIST of next tab stop to examine. */
  929. size_t tab_index = 0;
  930. -
  931. /* Convert a line of text. */
  932. do
  933. {
  934. - while ((c = getc (fp)) < 0 && (fp = next_file (fp)))
  935. - continue;
  936. + while (true) {
  937. + mbf_getc (c, mbf);
  938. + if ((mb_iseof (c)) && (fp = next_file (fp)))
  939. + {
  940. + mbf_init (mbf, fp);
  941. + if (fp!=NULL)
  942. + {
  943. + if (check_bom(fp,&mbf)==true)
  944. + {
  945. + /*Not the first file - check BOM header*/
  946. + if (using_utf_locale==false && found_bom==false)
  947. + {
  948. + /*BOM header in subsequent file but not in the first one. Not a system error, so errnum is 0. */
  949. + error (EXIT_FAILURE, 0, _("combination of files with and without BOM header"));
  950. + }
  951. + }
  952. + else
  953. + {
  954. + if(using_utf_locale==false && found_bom==true)
  955. + {
  956. + /*First file contained BOM header - locale was switched to UTF;
  957. + all subsequent files should contain BOM. Not a system error, so errnum is 0. */
  958. + error (EXIT_FAILURE, 0, _("combination of files with and without BOM header"));
  959. + }
  960. + }
  961. + }
  962. + continue;
  963. + }
  964. + else
  965. + {
  966. + break;
  967. + }
  968. + }
  969. +
  970. if (convert)
  971. {
  972. - if (c == '\t')
  973. + if (mb_iseq (c, '\t'))
  974. {
  975. /* Column the next input tab stop is on. */
  976. uintmax_t next_tab_column;
  977. @@ -151,32 +207,34 @@ expand (void)
  978. if (putchar (' ') < 0)
  979. die (EXIT_FAILURE, errno, _("write error"));
  980. - c = ' ';
  981. + mb_setascii (&c, ' ');
  982. }
  983. - else if (c == '\b')
  984. + else if (mb_iseq (c, '\b'))
  985. {
  986. /* Go back one column, and force recalculation of the
  987. next tab stop. */
  988. column -= !!column;
  989. tab_index -= !!tab_index;
  990. }
  991. - else
  992. + /* A leading control character could make us trip over. */
  993. + else if (!mb_iscntrl (c))
  994. {
  995. - column++;
  996. + column += mb_width (c);
  997. if (!column)
  998. die (EXIT_FAILURE, 0, _("input line is too long"));
  999. }
  1000. - convert &= convert_entire_line || !! isblank (c);
  1001. + convert &= convert_entire_line || mb_isblank (c);
  1002. }
  1003. - if (c < 0)
  1004. + if (mb_iseof (c))
  1005. return;
  1006. - if (putchar (c) < 0)
  1007. + mb_putc (c, stdout);
  1008. + if (ferror (stdout))
  1009. die (EXIT_FAILURE, errno, _("write error"));
  1010. }
  1011. - while (c != '\n');
  1012. + while (!mb_iseq (c, '\n'));
  1013. }
  1014. }
  1015. diff -Naurp coreutils-8.27-orig/src/expand-common.c coreutils-8.27/src/expand-common.c
  1016. --- coreutils-8.27-orig/src/expand-common.c 2017-03-01 11:22:55.000000000 -0600
  1017. +++ coreutils-8.27/src/expand-common.c 2017-03-11 23:49:06.757133570 -0600
  1018. @@ -18,6 +18,7 @@
  1019. #include <stdio.h>
  1020. #include <sys/types.h>
  1021. +#include <mbfile.h>
  1022. #include "system.h"
  1023. #include "die.h"
  1024. #include "error.h"
  1025. @@ -105,6 +106,119 @@ set_extend_size (uintmax_t tabval)
  1026. return ok;
  1027. }
  1028. +extern int
  1029. +set_utf_locale (void)
  1030. +{
  1031. + /*try using some predefined locale */
  1032. + const char* predef_locales[] = {"C.UTF8","en_US.UTF8","en_GB.UTF8"};
  1033. +
  1034. + const int predef_locales_count=3;
  1035. + for (int i=0;i<predef_locales_count;i++)
  1036. + {
  1037. + if (setlocale(LC_ALL,predef_locales[i])!=NULL)
  1038. + {
  1039. + break;
  1040. + }
  1041. + else if (i==predef_locales_count-1)
  1042. + {
  1043. + return 1;
  1044. + error (EXIT_FAILURE, errno, _("cannot set UTF-8 locale"));
  1045. + }
  1046. + }
  1047. + return 0;
  1048. +}
  1049. +
  1050. +extern bool
  1051. +check_utf_locale(void)
  1052. +{
  1053. + char* locale = setlocale (LC_CTYPE , NULL);
  1054. + if (locale == NULL)
  1055. + {
  1056. + return false;
  1057. + }
  1058. + else if (strcasestr(locale, "utf8") == NULL && strcasestr(locale, "utf-8") == NULL)
  1059. + {
  1060. + return false;
  1061. + }
  1062. + return true;
  1063. +}
  1064. +
  1065. +extern bool
  1066. +check_bom(FILE* fp, mb_file_t *mbf)
  1067. +{
  1068. + int c;
  1069. +
  1070. +
  1071. + c=fgetc(fp);
  1072. +
  1073. + /* test for a BOM header at the start of the file */
  1074. + mbf->bufcount=0;
  1075. + if (c == 0xEF)
  1076. + {
  1077. + c=fgetc(fp);
  1078. + }
  1079. + else
  1080. + {
  1081. + if (c != EOF)
  1082. + {
  1083. + ungetc(c,fp);
  1084. + }
  1085. + return false;
  1086. + }
  1087. +
  1088. + if (c == 0xBB)
  1089. + {
  1090. + c=fgetc(fp);
  1091. + }
  1092. + else
  1093. + {
  1094. + if ( c!= EOF )
  1095. + {
  1096. + mbf->buf[0]=(unsigned char) 0xEF;
  1097. + mbf->bufcount=1;
  1098. + ungetc(c,fp);
  1099. + return false;
  1100. + }
  1101. + else
  1102. + {
  1103. + ungetc(0xEF,fp);
  1104. + return false;
  1105. + }
  1106. + }
  1107. + if (c == 0xBF)
  1108. + {
  1109. + mbf->bufcount=0;
  1110. + return true;
  1111. + }
  1112. + else
  1113. + {
  1114. + if (c != EOF)
  1115. + {
  1116. + mbf->buf[0]=(unsigned char) 0xEF;
  1117. + mbf->buf[1]=(unsigned char) 0xBB;
  1118. + mbf->bufcount=2;
  1119. + ungetc(c,fp);
  1120. + return false;
  1121. + }
  1122. + else
  1123. + {
  1124. + mbf->buf[0]=(unsigned char) 0xEF;
  1125. + mbf->bufcount=1;
  1126. + ungetc(0xBB,fp);
  1127. + return false;
  1128. + }
  1129. + }
  1130. + return false;
  1131. +}
  1132. +
  1133. +extern void
  1134. +print_bom(void)
  1135. +{
  1136. + putc (0xEF, stdout);
  1137. + putc (0xBB, stdout);
  1138. + putc (0xBF, stdout);
  1139. +}
  1140. +
  1141. /* Add the comma or blank separated list of tab stops STOPS
  1142. to the list of tab stops. */
  1143. extern void
  1144. diff -Naurp coreutils-8.27-orig/src/expand-common.h coreutils-8.27/src/expand-common.h
  1145. --- coreutils-8.27-orig/src/expand-common.h 2017-01-01 16:34:24.000000000 -0600
  1146. +++ coreutils-8.27/src/expand-common.h 2017-03-11 23:49:06.758133530 -0600
  1147. @@ -34,6 +34,18 @@ extern size_t max_column_width;
  1148. /* The desired exit status. */
  1149. extern int exit_status;
  1150. +extern int
  1151. +set_utf_locale (void);
  1152. +
  1153. +extern bool
  1154. +check_utf_locale(void);
  1155. +
  1156. +extern bool
  1157. +check_bom(FILE* fp, mb_file_t *mbf);
  1158. +
  1159. +extern void
  1160. +print_bom(void);
  1161. +
  1162. /* Add tab stop TABVAL to the end of 'tab_list'. */
  1163. extern void
  1164. add_tab_stop (uintmax_t tabval);
  1165. diff -Naurp coreutils-8.27-orig/src/fold.c coreutils-8.27/src/fold.c
  1166. --- coreutils-8.27-orig/src/fold.c 2017-01-01 16:34:24.000000000 -0600
  1167. +++ coreutils-8.27/src/fold.c 2017-03-11 23:49:30.982169404 -0600
  1168. @@ -22,12 +22,34 @@
  1169. #include <getopt.h>
  1170. #include <sys/types.h>
  1171. +/* Get mbstate_t, mbrtowc(), wcwidth(). */
  1172. +#if HAVE_WCHAR_H
  1173. +# include <wchar.h>
  1174. +#endif
  1175. +
  1176. +/* Get iswprint(), iswblank(), wcwidth(). */
  1177. +#if HAVE_WCTYPE_H
  1178. +# include <wctype.h>
  1179. +#endif
  1180. +
  1181. #include "system.h"
  1182. #include "die.h"
  1183. #include "error.h"
  1184. #include "fadvise.h"
  1185. #include "xdectoint.h"
  1186. +/* MB_LEN_MAX is incorrectly defined to be 1 in at least one GCC
  1187. + installation; work around this configuration error. */
  1188. +#if !defined MB_LEN_MAX || MB_LEN_MAX < 2
  1189. +# undef MB_LEN_MAX
  1190. +# define MB_LEN_MAX 16
  1191. +#endif
  1192. +
  1193. +/* Some systems, like BeOS, have multibyte encodings but lack mbstate_t. */
  1194. +#if HAVE_MBRTOWC && defined mbstate_t
  1195. +# define mbrtowc(pwc, s, n, ps) (mbrtowc) (pwc, s, n, 0)
  1196. +#endif
  1197. +
  1198. #define TAB_WIDTH 8
  1199. /* The official name of this program (e.g., no 'g' prefix). */
  1200. @@ -35,20 +57,41 @@
  1201. #define AUTHORS proper_name ("David MacKenzie")
  1202. +#define FATAL_ERROR(Message) \
  1203. + do \
  1204. + { \
  1205. + error (0, 0, (Message)); \
  1206. + usage (2); \
  1207. + } \
  1208. + while (0)
  1209. +
  1210. +enum operating_mode
  1211. +{
  1212. + /* Fold texts by columns that are at the given positions. */
  1213. + column_mode,
  1214. +
  1215. + /* Fold texts by bytes that are at the given positions. */
  1216. + byte_mode,
  1217. +
  1218. + /* Fold texts by characters that are at the given positions. */
  1219. + character_mode,
  1220. +};
  1221. +
  1222. +/* The current operating mode. (Default: column_mode) */
  1223. +static enum operating_mode operating_mode;
  1224. +
  1225. /* If nonzero, try to break on whitespace. */
  1226. static bool break_spaces;
  1227. -/* If nonzero, count bytes, not column positions. */
  1228. -static bool count_bytes;
  1229. -
  1230. /* If nonzero, at least one of the files we read was standard input. */
  1231. static bool have_read_stdin;
  1232. -static char const shortopts[] = "bsw:0::1::2::3::4::5::6::7::8::9::";
  1233. +static char const shortopts[] = "bcsw:0::1::2::3::4::5::6::7::8::9::";
  1234. static struct option const longopts[] =
  1235. {
  1236. {"bytes", no_argument, NULL, 'b'},
  1237. + {"characters", no_argument, NULL, 'c'},
  1238. {"spaces", no_argument, NULL, 's'},
  1239. {"width", required_argument, NULL, 'w'},
  1240. {GETOPT_HELP_OPTION_DECL},
  1241. @@ -76,6 +119,7 @@ Wrap input lines in each FILE, writing t
  1242. fputs (_("\
  1243. -b, --bytes count bytes rather than columns\n\
  1244. + -c, --characters count characters rather than columns\n\
  1245. -s, --spaces break at spaces\n\
  1246. -w, --width=WIDTH use WIDTH columns instead of 80\n\
  1247. "), stdout);
  1248. @@ -93,7 +137,7 @@ Wrap input lines in each FILE, writing t
  1249. static size_t
  1250. adjust_column (size_t column, char c)
  1251. {
  1252. - if (!count_bytes)
  1253. + if (operating_mode != byte_mode)
  1254. {
  1255. if (c == '\b')
  1256. {
  1257. @@ -116,30 +160,14 @@ adjust_column (size_t column, char c)
  1258. to stdout, with maximum line length WIDTH.
  1259. Return true if successful. */
  1260. -static bool
  1261. -fold_file (char const *filename, size_t width)
  1262. +static void
  1263. +fold_text (FILE *istream, size_t width, int *saved_errno)
  1264. {
  1265. - FILE *istream;
  1266. int c;
  1267. size_t column = 0; /* Screen column where next char will go. */
  1268. size_t offset_out = 0; /* Index in 'line_out' for next char. */
  1269. static char *line_out = NULL;
  1270. static size_t allocated_out = 0;
  1271. - int saved_errno;
  1272. -
  1273. - if (STREQ (filename, "-"))
  1274. - {
  1275. - istream = stdin;
  1276. - have_read_stdin = true;
  1277. - }
  1278. - else
  1279. - istream = fopen (filename, "r");
  1280. -
  1281. - if (istream == NULL)
  1282. - {
  1283. - error (0, errno, "%s", quotef (filename));
  1284. - return false;
  1285. - }
  1286. fadvise (istream, FADVISE_SEQUENTIAL);
  1287. @@ -169,6 +197,15 @@ fold_file (char const *filename, size_t
  1288. bool found_blank = false;
  1289. size_t logical_end = offset_out;
  1290. + /* If LINE_OUT is still empty,
  1291. + put the new character in LINE_OUT
  1292. + even if column is bigger than width. */
  1293. + if (offset_out == 0)
  1294. + {
  1295. + line_out[offset_out++] = c;
  1296. + continue;
  1297. + }
  1298. +
  1299. /* Look for the last blank. */
  1300. while (logical_end)
  1301. {
  1302. @@ -215,11 +252,220 @@ fold_file (char const *filename, size_t
  1303. line_out[offset_out++] = c;
  1304. }
  1305. - saved_errno = errno;
  1306. + *saved_errno = errno;
  1307. +
  1308. + if (offset_out)
  1309. + fwrite (line_out, sizeof (char), (size_t) offset_out, stdout);
  1310. +
  1311. +}
  1312. +
  1313. +#if HAVE_MBRTOWC
  1314. +static void
  1315. +fold_multibyte_text (FILE *istream, size_t width, int *saved_errno)
  1316. +{
  1317. + char buf[MB_LEN_MAX + BUFSIZ]; /* For spooling a read byte sequence. */
  1318. + size_t buflen = 0; /* The length of the byte sequence in buf. */
  1319. + char *bufpos = buf; /* Next read position of BUF. */
  1320. + wint_t wc; /* A gotten wide character. */
  1321. + size_t mblength; /* The byte size of a multibyte character which shows
  1322. + as same character as WC. */
  1323. + mbstate_t state, state_bak; /* State of the stream. */
  1324. + int convfail = 0; /* 1, when conversion is failed. Otherwise 0. */
  1325. +
  1326. + static char *line_out = NULL;
  1327. + size_t offset_out = 0; /* Index in `line_out' for next char. */
  1328. + static size_t allocated_out = 0;
  1329. +
  1330. + int increment;
  1331. + size_t column = 0;
  1332. +
  1333. + size_t last_blank_pos;
  1334. + size_t last_blank_column;
  1335. + int is_blank_seen;
  1336. + int last_blank_increment = 0;
  1337. + int is_bs_following_last_blank;
  1338. + size_t bs_following_last_blank_num;
  1339. + int is_cr_after_last_blank;
  1340. +
  1341. +#define CLEAR_FLAGS \
  1342. + do \
  1343. + { \
  1344. + last_blank_pos = 0; \
  1345. + last_blank_column = 0; \
  1346. + is_blank_seen = 0; \
  1347. + is_bs_following_last_blank = 0; \
  1348. + bs_following_last_blank_num = 0; \
  1349. + is_cr_after_last_blank = 0; \
  1350. + } \
  1351. + while (0)
  1352. +
  1353. +#define START_NEW_LINE \
  1354. + do \
  1355. + { \
  1356. + putchar ('\n'); \
  1357. + column = 0; \
  1358. + offset_out = 0; \
  1359. + CLEAR_FLAGS; \
  1360. + } \
  1361. + while (0)
  1362. +
  1363. + CLEAR_FLAGS;
  1364. + memset (&state, '\0', sizeof(mbstate_t));
  1365. +
  1366. + for (;; bufpos += mblength, buflen -= mblength)
  1367. + {
  1368. + if (buflen < MB_LEN_MAX && !feof (istream) && !ferror (istream))
  1369. + {
  1370. + memmove (buf, bufpos, buflen);
  1371. + buflen += fread (buf + buflen, sizeof(char), BUFSIZ, istream);
  1372. + bufpos = buf;
  1373. + }
  1374. +
  1375. + if (buflen < 1)
  1376. + break;
  1377. +
  1378. + /* Get a wide character. */
  1379. + state_bak = state;
  1380. + mblength = mbrtowc ((wchar_t *)&wc, bufpos, buflen, &state);
  1381. +
  1382. + switch (mblength)
  1383. + {
  1384. + case (size_t)-1:
  1385. + case (size_t)-2:
  1386. + convfail++;
  1387. + state = state_bak;
  1388. + /* Fall through. */
  1389. +
  1390. + case 0:
  1391. + mblength = 1;
  1392. + break;
  1393. + }
  1394. +
  1395. +rescan:
  1396. + if (convfail)
  1397. + increment = 1;
  1398. + else if (wc == L'\n')
  1399. + {
  1400. + /* preserve newline */
  1401. + fwrite (line_out, sizeof(char), offset_out, stdout);
  1402. + START_NEW_LINE;
  1403. + continue;
  1404. + }
  1405. + else if (operating_mode == byte_mode) /* byte mode */
  1406. + increment = mblength;
  1407. + else if (operating_mode == character_mode) /* character mode */
  1408. + increment = 1;
  1409. + else /* column mode */
  1410. + {
  1411. + switch (wc)
  1412. + {
  1413. + case L'\b':
  1414. + increment = (column > 0) ? -1 : 0;
  1415. + break;
  1416. +
  1417. + case L'\r':
  1418. + increment = -1 * column;
  1419. + break;
  1420. +
  1421. + case L'\t':
  1422. + increment = 8 - column % 8;
  1423. + break;
  1424. +
  1425. + default:
  1426. + increment = wcwidth (wc);
  1427. + increment = (increment < 0) ? 0 : increment;
  1428. + }
  1429. + }
  1430. +
  1431. + if (column + increment > width && break_spaces && last_blank_pos)
  1432. + {
  1433. + fwrite (line_out, sizeof(char), last_blank_pos, stdout);
  1434. + putchar ('\n');
  1435. +
  1436. + offset_out = offset_out - last_blank_pos;
  1437. + column = column - last_blank_column + ((is_cr_after_last_blank)
  1438. + ? last_blank_increment : bs_following_last_blank_num);
  1439. + memmove (line_out, line_out + last_blank_pos, offset_out);
  1440. + CLEAR_FLAGS;
  1441. + goto rescan;
  1442. + }
  1443. +
  1444. + if (column + increment > width && column != 0)
  1445. + {
  1446. + fwrite (line_out, sizeof(char), offset_out, stdout);
  1447. + START_NEW_LINE;
  1448. + goto rescan;
  1449. + }
  1450. +
  1451. + if (allocated_out < offset_out + mblength)
  1452. + {
  1453. + line_out = X2REALLOC (line_out, &allocated_out);
  1454. + }
  1455. +
  1456. + memcpy (line_out + offset_out, bufpos, mblength);
  1457. + offset_out += mblength;
  1458. + column += increment;
  1459. +
  1460. + if (is_blank_seen && !convfail && wc == L'\r')
  1461. + is_cr_after_last_blank = 1;
  1462. +
  1463. + if (is_bs_following_last_blank && !convfail && wc == L'\b')
  1464. + ++bs_following_last_blank_num;
  1465. + else
  1466. + is_bs_following_last_blank = 0;
  1467. +
  1468. + if (break_spaces && !convfail && iswblank (wc))
  1469. + {
  1470. + last_blank_pos = offset_out;
  1471. + last_blank_column = column;
  1472. + is_blank_seen = 1;
  1473. + last_blank_increment = increment;
  1474. + is_bs_following_last_blank = 1;
  1475. + bs_following_last_blank_num = 0;
  1476. + is_cr_after_last_blank = 0;
  1477. + }
  1478. + }
  1479. +
  1480. + *saved_errno = errno;
  1481. if (offset_out)
  1482. fwrite (line_out, sizeof (char), (size_t) offset_out, stdout);
  1483. +}
  1484. +#endif
  1485. +
  1486. +/* Fold file FILENAME, or standard input if FILENAME is "-",
  1487. + to stdout, with maximum line length WIDTH.
  1488. + Return 0 if successful, 1 if an error occurs. */
  1489. +
  1490. +static bool
  1491. +fold_file (char const *filename, size_t width)
  1492. +{
  1493. + FILE *istream;
  1494. + int saved_errno;
  1495. +
  1496. + if (STREQ (filename, "-"))
  1497. + {
  1498. + istream = stdin;
  1499. + have_read_stdin = 1;
  1500. + }
  1501. + else
  1502. + istream = fopen (filename, "r");
  1503. +
  1504. + if (istream == NULL)
  1505. + {
  1506. + error (0, errno, "%s", filename);
  1507. + return 1;
  1508. + }
  1509. +
  1510. + /* Define how ISTREAM is being folded. */
  1511. +#if HAVE_MBRTOWC
  1512. + if (MB_CUR_MAX > 1)
  1513. + fold_multibyte_text (istream, width, &saved_errno);
  1514. + else
  1515. +#endif
  1516. + fold_text (istream, width, &saved_errno);
  1517. +
  1518. if (ferror (istream))
  1519. {
  1520. error (0, saved_errno, "%s", quotef (filename));
  1521. @@ -252,7 +498,8 @@ main (int argc, char **argv)
  1522. atexit (close_stdout);
  1523. - break_spaces = count_bytes = have_read_stdin = false;
  1524. + operating_mode = column_mode;
  1525. + break_spaces = have_read_stdin = false;
  1526. while ((optc = getopt_long (argc, argv, shortopts, longopts, NULL)) != -1)
  1527. {
  1528. @@ -261,7 +508,15 @@ main (int argc, char **argv)
  1529. switch (optc)
  1530. {
  1531. case 'b': /* Count bytes rather than columns. */
  1532. - count_bytes = true;
  1533. + if (operating_mode != column_mode)
  1534. + FATAL_ERROR (_("only one way of folding may be specified"));
  1535. + operating_mode = byte_mode;
  1536. + break;
  1537. +
  1538. + case 'c':
  1539. + if (operating_mode != column_mode)
  1540. + FATAL_ERROR (_("only one way of folding may be specified"));
  1541. + operating_mode = character_mode;
  1542. break;
  1543. case 's': /* Break at word boundaries. */
  1544. diff -Naurp coreutils-8.27-orig/src/join.c coreutils-8.27/src/join.c
  1545. --- coreutils-8.27-orig/src/join.c 2017-01-01 16:34:24.000000000 -0600
  1546. +++ coreutils-8.27/src/join.c 2017-03-11 23:47:13.091286290 -0600
  1547. @@ -22,19 +22,33 @@
  1548. #include <sys/types.h>
  1549. #include <getopt.h>
  1550. +/* Get mbstate_t, mbrtowc(), wcrtomb(), wcwidth(). */
  1551. +#if HAVE_WCHAR_H
  1552. +# include <wchar.h>
  1553. +#endif
  1554. +
  1555. +/* Get iswblank(), towupper. */
  1556. +#if HAVE_WCTYPE_H
  1557. +# include <wctype.h>
  1558. +#endif
  1559. +
  1560. #include "system.h"
  1561. #include "die.h"
  1562. #include "error.h"
  1563. #include "fadvise.h"
  1564. #include "hard-locale.h"
  1565. #include "linebuffer.h"
  1566. -#include "memcasecmp.h"
  1567. #include "quote.h"
  1568. #include "stdio--.h"
  1569. #include "xmemcoll.h"
  1570. #include "xstrtol.h"
  1571. #include "argmatch.h"
  1572. +/* Some systems, like BeOS, have multibyte encodings but lack mbstate_t. */
  1573. +#if HAVE_MBRTOWC && defined mbstate_t
  1574. +# define mbrtowc(pwc, s, n, ps) (mbrtowc) (pwc, s, n, 0)
  1575. +#endif
  1576. +
  1577. /* The official name of this program (e.g., no 'g' prefix). */
  1578. #define PROGRAM_NAME "join"
  1579. @@ -136,10 +150,12 @@ static struct outlist outlist_head;
  1580. /* Last element in 'outlist', where a new element can be added. */
  1581. static struct outlist *outlist_end = &outlist_head;
  1582. -/* Tab character separating fields. If negative, fields are separated
  1583. - by any nonempty string of blanks, otherwise by exactly one
  1584. - tab character whose value (when cast to unsigned char) equals TAB. */
  1585. -static int tab = -1;
  1586. +/* Tab character separating fields. If NULL, fields are separated
  1587. + by any nonempty string of blanks. */
  1588. +static char *tab = NULL;
  1589. +
  1590. +/* The number of bytes used for tab. */
  1591. +static size_t tablen = 0;
  1592. /* If nonzero, check that the input is correctly ordered. */
  1593. static enum
  1594. @@ -276,13 +292,14 @@ xfields (struct line *line)
  1595. if (ptr == lim)
  1596. return;
  1597. - if (0 <= tab && tab != '\n')
  1598. + if (tab != NULL)
  1599. {
  1600. + unsigned char t = tab[0];
  1601. char *sep;
  1602. - for (; (sep = memchr (ptr, tab, lim - ptr)) != NULL; ptr = sep + 1)
  1603. + for (; (sep = memchr (ptr, t, lim - ptr)) != NULL; ptr = sep + 1)
  1604. extract_field (line, ptr, sep - ptr);
  1605. }
  1606. - else if (tab < 0)
  1607. + else
  1608. {
  1609. /* Skip leading blanks before the first field. */
  1610. while (field_sep (*ptr))
  1611. @@ -306,6 +323,147 @@ xfields (struct line *line)
  1612. extract_field (line, ptr, lim - ptr);
  1613. }
  1614. +#if HAVE_MBRTOWC
  1615. +static void
  1616. +xfields_multibyte (struct line *line)
  1617. +{
  1618. + char *ptr = line->buf.buffer;
  1619. + char const *lim = ptr + line->buf.length - 1;
  1620. + wchar_t wc = 0;
  1621. + size_t mblength = 1;
  1622. + mbstate_t state, state_bak;
  1623. +
  1624. + memset (&state, 0, sizeof (mbstate_t));
  1625. +
  1626. + if (ptr >= lim)
  1627. + return;
  1628. +
  1629. + if (tab != NULL)
  1630. + {
  1631. + char *sep = ptr;
  1632. + for (; ptr < lim; ptr = sep + mblength)
  1633. + {
  1634. + sep = ptr;
  1635. + while (sep < lim)
  1636. + {
  1637. + state_bak = state;
  1638. + mblength = mbrtowc (&wc, sep, lim - sep + 1, &state);
  1639. +
  1640. + if (mblength == (size_t)-1 || mblength == (size_t)-2)
  1641. + {
  1642. + mblength = 1;
  1643. + state = state_bak;
  1644. + }
  1645. + mblength = (mblength < 1) ? 1 : mblength;
  1646. +
  1647. + if (mblength == tablen && !memcmp (sep, tab, mblength))
  1648. + break;
  1649. + else
  1650. + {
  1651. + sep += mblength;
  1652. + continue;
  1653. + }
  1654. + }
  1655. +
  1656. + if (sep >= lim)
  1657. + break;
  1658. +
  1659. + extract_field (line, ptr, sep - ptr);
  1660. + }
  1661. + }
  1662. + else
  1663. + {
  1664. + /* Skip leading blanks before the first field. */
  1665. + while(ptr < lim)
  1666. + {
  1667. + state_bak = state;
  1668. + mblength = mbrtowc (&wc, ptr, lim - ptr + 1, &state);
  1669. +
  1670. + if (mblength == (size_t)-1 || mblength == (size_t)-2)
  1671. + {
  1672. + mblength = 1;
  1673. + state = state_bak;
  1674. + break;
  1675. + }
  1676. + mblength = (mblength < 1) ? 1 : mblength;
  1677. +
  1678. + if (!iswblank(wc) && wc != '\n')
  1679. + break;
  1680. + ptr += mblength;
  1681. + }
  1682. +
  1683. + do
  1684. + {
  1685. + char *sep;
  1686. + state_bak = state;
  1687. + mblength = mbrtowc (&wc, ptr, lim - ptr + 1, &state);
  1688. + if (mblength == (size_t)-1 || mblength == (size_t)-2)
  1689. + {
  1690. + mblength = 1;
  1691. + state = state_bak;
  1692. + break;
  1693. + }
  1694. + mblength = (mblength < 1) ? 1 : mblength;
  1695. +
  1696. + sep = ptr + mblength;
  1697. + while (sep < lim)
  1698. + {
  1699. + state_bak = state;
  1700. + mblength = mbrtowc (&wc, sep, lim - sep + 1, &state);
  1701. + if (mblength == (size_t)-1 || mblength == (size_t)-2)
  1702. + {
  1703. + mblength = 1;
  1704. + state = state_bak;
  1705. + break;
  1706. + }
  1707. + mblength = (mblength < 1) ? 1 : mblength;
  1708. +
  1709. + if (iswblank (wc) || wc == '\n')
  1710. + break;
  1711. +
  1712. + sep += mblength;
  1713. + }
  1714. +
  1715. + extract_field (line, ptr, sep - ptr);
  1716. + if (sep >= lim)
  1717. + return;
  1718. +
  1719. + state_bak = state;
  1720. + mblength = mbrtowc (&wc, sep, lim - sep + 1, &state);
  1721. + if (mblength == (size_t)-1 || mblength == (size_t)-2)
  1722. + {
  1723. + mblength = 1;
  1724. + state = state_bak;
  1725. + break;
  1726. + }
  1727. + mblength = (mblength < 1) ? 1 : mblength;
  1728. +
  1729. + ptr = sep + mblength;
  1730. + while (ptr < lim)
  1731. + {
  1732. + state_bak = state;
  1733. + mblength = mbrtowc (&wc, ptr, lim - ptr + 1, &state);
  1734. + if (mblength == (size_t)-1 || mblength == (size_t)-2)
  1735. + {
  1736. + mblength = 1;
  1737. + state = state_bak;
  1738. + break;
  1739. + }
  1740. + mblength = (mblength < 1) ? 1 : mblength;
  1741. +
  1742. + if (!iswblank (wc) && wc != '\n')
  1743. + break;
  1744. +
  1745. + ptr += mblength;
  1746. + }
  1747. + }
  1748. + while (ptr < lim);
  1749. + }
  1750. +
  1751. + extract_field (line, ptr, lim - ptr);
  1752. +}
  1753. +#endif
  1754. +
  1755. static void
  1756. freeline (struct line *line)
  1757. {
  1758. @@ -327,56 +485,133 @@ keycmp (struct line const *line1, struct
  1759. size_t jf_1, size_t jf_2)
  1760. {
  1761. /* Start of field to compare in each file. */
  1762. - char *beg1;
  1763. - char *beg2;
  1764. -
  1765. - size_t len1;
  1766. - size_t len2; /* Length of fields to compare. */
  1767. + char *beg[2];
  1768. + char *copy[2];
  1769. + size_t len[2]; /* Length of fields to compare. */
  1770. int diff;
  1771. + int i, j;
  1772. + int mallocd = 0;
  1773. if (jf_1 < line1->nfields)
  1774. {
  1775. - beg1 = line1->fields[jf_1].beg;
  1776. - len1 = line1->fields[jf_1].len;
  1777. + beg[0] = line1->fields[jf_1].beg;
  1778. + len[0] = line1->fields[jf_1].len;
  1779. }
  1780. else
  1781. {
  1782. - beg1 = NULL;
  1783. - len1 = 0;
  1784. + beg[0] = NULL;
  1785. + len[0] = 0;
  1786. }
  1787. if (jf_2 < line2->nfields)
  1788. {
  1789. - beg2 = line2->fields[jf_2].beg;
  1790. - len2 = line2->fields[jf_2].len;
  1791. + beg[1] = line2->fields[jf_2].beg;
  1792. + len[1] = line2->fields[jf_2].len;
  1793. }
  1794. else
  1795. {
  1796. - beg2 = NULL;
  1797. - len2 = 0;
  1798. + beg[1] = NULL;
  1799. + len[1] = 0;
  1800. }
  1801. - if (len1 == 0)
  1802. - return len2 == 0 ? 0 : -1;
  1803. - if (len2 == 0)
  1804. + if (len[0] == 0)
  1805. + return len[1] == 0 ? 0 : -1;
  1806. + if (len[1] == 0)
  1807. return 1;
  1808. if (ignore_case)
  1809. {
  1810. - /* FIXME: ignore_case does not work with NLS (in particular,
  1811. - with multibyte chars). */
  1812. - diff = memcasecmp (beg1, beg2, MIN (len1, len2));
  1813. +#ifdef HAVE_MBRTOWC
  1814. + if (MB_CUR_MAX > 1)
  1815. + {
  1816. + size_t mblength;
  1817. + wchar_t wc, uwc;
  1818. + mbstate_t state, state_bak;
  1819. +
  1820. + memset (&state, '\0', sizeof (mbstate_t));
  1821. +
  1822. + for (i = 0; i < 2; i++)
  1823. + {
  1824. + mallocd = 1;
  1825. + copy[i] = xmalloc (len[i] + 1);
  1826. + memset (copy[i], '\0',len[i] + 1);
  1827. +
  1828. + for (j = 0; j < MIN (len[0], len[1]);)
  1829. + {
  1830. + state_bak = state;
  1831. + mblength = mbrtowc (&wc, beg[i] + j, len[i] - j, &state);
  1832. +
  1833. + switch (mblength)
  1834. + {
  1835. + case (size_t) -1:
  1836. + case (size_t) -2:
  1837. + state = state_bak;
  1838. + /* Fall through */
  1839. + case 0:
  1840. + mblength = 1;
  1841. + break;
  1842. +
  1843. + default:
  1844. + uwc = towupper (wc);
  1845. +
  1846. + if (uwc != wc)
  1847. + {
  1848. + mbstate_t state_wc;
  1849. + size_t mblen;
  1850. +
  1851. + memset (&state_wc, '\0', sizeof (mbstate_t));
  1852. + mblen = wcrtomb (copy[i] + j, uwc, &state_wc);
  1853. + assert (mblen != (size_t)-1);
  1854. + }
  1855. + else
  1856. + memcpy (copy[i] + j, beg[i] + j, mblength);
  1857. + }
  1858. + j += mblength;
  1859. + }
  1860. + copy[i][j] = '\0';
  1861. + }
  1862. + }
  1863. + else
  1864. +#endif
  1865. + {
  1866. + for (i = 0; i < 2; i++)
  1867. + {
  1868. + mallocd = 1;
  1869. + copy[i] = xmalloc (len[i] + 1);
  1870. +
  1871. + for (j = 0; j < MIN (len[0], len[1]); j++)
  1872. + copy[i][j] = toupper (beg[i][j]);
  1873. +
  1874. + copy[i][j] = '\0';
  1875. + }
  1876. + }
  1877. }
  1878. else
  1879. {
  1880. - if (hard_LC_COLLATE)
  1881. - return xmemcoll (beg1, len1, beg2, len2);
  1882. - diff = memcmp (beg1, beg2, MIN (len1, len2));
  1883. + copy[0] = beg[0];
  1884. + copy[1] = beg[1];
  1885. }
  1886. + if (hard_LC_COLLATE)
  1887. + {
  1888. + diff = xmemcoll ((char *) copy[0], len[0], (char *) copy[1], len[1]);
  1889. +
  1890. + if (mallocd)
  1891. + for (i = 0; i < 2; i++)
  1892. + free (copy[i]);
  1893. +
  1894. + return diff;
  1895. + }
  1896. + diff = memcmp (copy[0], copy[1], MIN (len[0], len[1]));
  1897. +
  1898. + if (mallocd)
  1899. + for (i = 0; i < 2; i++)
  1900. + free (copy[i]);
  1901. +
  1902. +
  1903. if (diff)
  1904. return diff;
  1905. - return len1 < len2 ? -1 : len1 != len2;
  1906. + return len[0] - len[1];
  1907. }
  1908. /* Check that successive input lines PREV and CURRENT from input file
  1909. @@ -468,6 +703,11 @@ get_line (FILE *fp, struct line **linep,
  1910. }
  1911. ++line_no[which - 1];
  1912. +#if HAVE_MBRTOWC
  1913. + if (MB_CUR_MAX > 1)
  1914. + xfields_multibyte (line);
  1915. + else
  1916. +#endif
  1917. xfields (line);
  1918. if (prevline[which - 1])
  1919. @@ -567,21 +807,28 @@ prfield (size_t n, struct line const *li
  1920. /* Output all the fields in line, other than the join field. */
  1921. +#define PUT_TAB_CHAR \
  1922. + do \
  1923. + { \
  1924. + (tab != NULL) ? \
  1925. + fwrite(tab, sizeof(char), tablen, stdout) : putchar (' '); \
  1926. + } \
  1927. + while (0)
  1928. +
  1929. static void
  1930. prfields (struct line const *line, size_t join_field, size_t autocount)
  1931. {
  1932. size_t i;
  1933. size_t nfields = autoformat ? autocount : line->nfields;
  1934. - char output_separator = tab < 0 ? ' ' : tab;
  1935. for (i = 0; i < join_field && i < nfields; ++i)
  1936. {
  1937. - putchar (output_separator);
  1938. + PUT_TAB_CHAR;
  1939. prfield (i, line);
  1940. }
  1941. for (i = join_field + 1; i < nfields; ++i)
  1942. {
  1943. - putchar (output_separator);
  1944. + PUT_TAB_CHAR;
  1945. prfield (i, line);
  1946. }
  1947. }
  1948. @@ -592,7 +839,6 @@ static void
  1949. prjoin (struct line const *line1, struct line const *line2)
  1950. {
  1951. const struct outlist *outlist;
  1952. - char output_separator = tab < 0 ? ' ' : tab;
  1953. size_t field;
  1954. struct line const *line;
  1955. @@ -626,7 +872,7 @@ prjoin (struct line const *line1, struct
  1956. o = o->next;
  1957. if (o == NULL)
  1958. break;
  1959. - putchar (output_separator);
  1960. + PUT_TAB_CHAR;
  1961. }
  1962. putchar (eolchar);
  1963. }
  1964. @@ -1104,20 +1350,43 @@ main (int argc, char **argv)
  1965. case 't':
  1966. {
  1967. - unsigned char newtab = optarg[0];
  1968. + char *newtab = NULL;
  1969. + size_t newtablen;
  1970. + newtab = xstrdup (optarg);
  1971. +#if HAVE_MBRTOWC
  1972. + if (MB_CUR_MAX > 1)
  1973. + {
  1974. + mbstate_t state;
  1975. +
  1976. + memset (&state, 0, sizeof (mbstate_t));
  1977. + newtablen = mbrtowc (NULL, newtab,
  1978. + strnlen (newtab, MB_LEN_MAX),
  1979. + &state);
  1980. + if (newtablen == (size_t) 0
  1981. + || newtablen == (size_t) -1
  1982. + || newtablen == (size_t) -2)
  1983. + newtablen = 1;
  1984. + }
  1985. + else
  1986. +#endif
  1987. + newtablen = 1;
  1988. if (! newtab)
  1989. - newtab = '\n'; /* '' => process the whole line. */
  1990. + newtab = (char*)"\n"; /* '' => process the whole line. */
  1991. else if (optarg[1])
  1992. {
  1993. - if (STREQ (optarg, "\\0"))
  1994. - newtab = '\0';
  1995. - else
  1996. - die (EXIT_FAILURE, 0, _("multi-character tab %s"),
  1997. - quote (optarg));
  1998. + if (newtablen == 1 && newtab[1])
  1999. + {
  2000. + if (STREQ (newtab, "\\0"))
  2001. + newtab[0] = '\0';
  2002. + }
  2003. + }
  2004. + if (tab != NULL && strcmp (tab, newtab))
  2005. + {
  2006. + free (newtab);
  2007. + die (EXIT_FAILURE, 0, _("incompatible tabs"));
  2008. }
  2009. - if (0 <= tab && tab != newtab)
  2010. - die (EXIT_FAILURE, 0, _("incompatible tabs"));
  2011. tab = newtab;
  2012. + tablen = newtablen;
  2013. }
  2014. break;
  2015. diff -Naurp coreutils-8.27-orig/src/pr.c coreutils-8.27/src/pr.c
  2016. --- coreutils-8.27-orig/src/pr.c 2017-01-01 16:34:24.000000000 -0600
  2017. +++ coreutils-8.27/src/pr.c 2017-03-11 23:47:13.094286139 -0600
  2018. @@ -311,6 +311,24 @@
  2019. #include <getopt.h>
  2020. #include <sys/types.h>
  2021. +
  2022. +/* Get MB_LEN_MAX. */
  2023. +#include <limits.h>
  2024. +/* MB_LEN_MAX is incorrectly defined to be 1 in at least one GCC
  2025. + installation; work around this configuration error. */
  2026. +#if !defined MB_LEN_MAX || MB_LEN_MAX == 1
  2027. +# define MB_LEN_MAX 16
  2028. +#endif
  2029. +
  2030. +/* Get MB_CUR_MAX. */
  2031. +#include <stdlib.h>
  2032. +
  2033. +/* Solaris 2.5 has a bug: <wchar.h> must be included before <wctype.h>. */
  2034. +/* Get mbstate_t, mbrtowc(), wcwidth(). */
  2035. +#if HAVE_WCHAR_H
  2036. +# include <wchar.h>
  2037. +#endif
  2038. +
  2039. #include "system.h"
  2040. #include "die.h"
  2041. #include "error.h"
  2042. @@ -324,6 +342,18 @@
  2043. #include "xstrtol.h"
  2044. #include "xdectoint.h"
  2045. +/* Some systems, like BeOS, have multibyte encodings but lack mbstate_t. */
  2046. +#if HAVE_MBRTOWC && defined mbstate_t
  2047. +# define mbrtowc(pwc, s, n, ps) (mbrtowc) (pwc, s, n, 0)
  2048. +#endif
  2049. +
  2050. +#ifndef HAVE_DECL_WCWIDTH
  2051. +"this configure-time declaration test was not run"
  2052. +#endif
  2053. +#if !HAVE_DECL_WCWIDTH
  2054. +extern int wcwidth ();
  2055. +#endif
  2056. +
  2057. /* The official name of this program (e.g., no 'g' prefix). */
  2058. #define PROGRAM_NAME "pr"
  2059. @@ -416,7 +446,20 @@ struct COLUMN
  2060. typedef struct COLUMN COLUMN;
  2061. -static int char_to_clump (char c);
  2062. +/* Function pointers to switch functions for single byte locale or for
  2063. + multibyte locale. If multibyte functions do not exist in your system,
  2064. + these pointers always point to the functions for single byte locale. */
  2065. +static void (*print_char) (char c);
  2066. +static int (*char_to_clump) (char c);
  2067. +
  2068. +/* Functions for single byte locale. */
  2069. +static void print_char_single (char c);
  2070. +static int char_to_clump_single (char c);
  2071. +
  2072. +/* Functions for multibyte locale. */
  2073. +static void print_char_multi (char c);
  2074. +static int char_to_clump_multi (char c);
  2075. +
  2076. static bool read_line (COLUMN *p);
  2077. static bool print_page (void);
  2078. static bool print_stored (COLUMN *p);
  2079. @@ -428,6 +471,7 @@ static void add_line_number (COLUMN *p);
  2080. static void getoptnum (const char *n_str, int min, int *num,
  2081. const char *errfmt);
  2082. static void getoptarg (char *arg, char switch_char, char *character,
  2083. + int *character_length, int *character_width,
  2084. int *number);
  2085. static void print_files (int number_of_files, char **av);
  2086. static void init_parameters (int number_of_files);
  2087. @@ -441,7 +485,6 @@ static void store_char (char c);
  2088. static void pad_down (unsigned int lines);
  2089. static void read_rest_of_line (COLUMN *p);
  2090. static void skip_read (COLUMN *p, int column_number);
  2091. -static void print_char (char c);
  2092. static void cleanup (void);
  2093. static void print_sep_string (void);
  2094. static void separator_string (const char *optarg_S);
  2095. @@ -453,7 +496,7 @@ static COLUMN *column_vector;
  2096. we store the leftmost columns contiguously in buff.
  2097. To print a line from buff, get the index of the first character
  2098. from line_vector[i], and print up to line_vector[i + 1]. */
  2099. -static char *buff;
  2100. +static unsigned char *buff;
  2101. /* Index of the position in buff where the next character
  2102. will be stored. */
  2103. @@ -557,7 +600,7 @@ static int chars_per_column;
  2104. static bool untabify_input = false;
  2105. /* (-e) The input tab character. */
  2106. -static char input_tab_char = '\t';
  2107. +static char input_tab_char[MB_LEN_MAX] = "\t";
  2108. /* (-e) Tabstops are at chars_per_tab, 2*chars_per_tab, 3*chars_per_tab, ...
  2109. where the leftmost column is 1. */
  2110. @@ -567,7 +610,10 @@ static int chars_per_input_tab = 8;
  2111. static bool tabify_output = false;
  2112. /* (-i) The output tab character. */
  2113. -static char output_tab_char = '\t';
  2114. +static char output_tab_char[MB_LEN_MAX] = "\t";
  2115. +
  2116. +/* (-i) The byte length of output tab character. */
  2117. +static int output_tab_char_length = 1;
  2118. /* (-i) The width of the output tab. */
  2119. static int chars_per_output_tab = 8;
  2120. @@ -637,7 +683,13 @@ static int line_number;
  2121. static bool numbered_lines = false;
  2122. /* (-n) Character which follows each line number. */
  2123. -static char number_separator = '\t';
  2124. +static char number_separator[MB_LEN_MAX] = "\t";
  2125. +
  2126. +/* (-n) The byte length of the character which follows each line number. */
  2127. +static int number_separator_length = 1;
  2128. +
  2129. +/* (-n) The character width of the character which follows each line number. */
  2130. +static int number_separator_width = 0;
  2131. /* (-n) line counting starts with 1st line of input file (not with 1st
  2132. line of 1st page printed). */
  2133. @@ -690,6 +742,7 @@ static bool use_col_separator = false;
  2134. -a|COLUMN|-m is a 'space' and with the -J option a 'tab'. */
  2135. static char const *col_sep_string = "";
  2136. static int col_sep_length = 0;
  2137. +static int col_sep_width = 0;
  2138. static char *column_separator = (char *) " ";
  2139. static char *line_separator = (char *) "\t";
  2140. @@ -851,6 +904,13 @@ separator_string (const char *optarg_S)
  2141. integer_overflow ();
  2142. col_sep_length = len;
  2143. col_sep_string = optarg_S;
  2144. +
  2145. +#if HAVE_MBRTOWC
  2146. + if (MB_CUR_MAX > 1)
  2147. + col_sep_width = mbswidth (col_sep_string, 0);
  2148. + else
  2149. +#endif
  2150. + col_sep_width = col_sep_length;
  2151. }
  2152. int
  2153. @@ -875,6 +935,21 @@ main (int argc, char **argv)
  2154. atexit (close_stdout);
  2155. +/* Define which functions are used, the ones for single byte locale or the ones
  2156. + for multibyte locale. */
  2157. +#if HAVE_MBRTOWC
  2158. + if (MB_CUR_MAX > 1)
  2159. + {
  2160. + print_char = print_char_multi;
  2161. + char_to_clump = char_to_clump_multi;
  2162. + }
  2163. + else
  2164. +#endif
  2165. + {
  2166. + print_char = print_char_single;
  2167. + char_to_clump = char_to_clump_single;
  2168. + }
  2169. +
  2170. n_files = 0;
  2171. file_names = (argc > 1
  2172. ? xnmalloc (argc - 1, sizeof (char *))
  2173. @@ -951,8 +1026,12 @@ main (int argc, char **argv)
  2174. break;
  2175. case 'e':
  2176. if (optarg)
  2177. - getoptarg (optarg, 'e', &input_tab_char,
  2178. - &chars_per_input_tab);
  2179. + {
  2180. + int dummy_length, dummy_width;
  2181. +
  2182. + getoptarg (optarg, 'e', input_tab_char, &dummy_length,
  2183. + &dummy_width, &chars_per_input_tab);
  2184. + }
  2185. /* Could check tab width > 0. */
  2186. untabify_input = true;
  2187. break;
  2188. @@ -965,8 +1044,12 @@ main (int argc, char **argv)
  2189. break;
  2190. case 'i':
  2191. if (optarg)
  2192. - getoptarg (optarg, 'i', &output_tab_char,
  2193. - &chars_per_output_tab);
  2194. + {
  2195. + int dummy_width;
  2196. +
  2197. + getoptarg (optarg, 'i', output_tab_char, &output_tab_char_length,
  2198. + &dummy_width, &chars_per_output_tab);
  2199. + }
  2200. /* Could check tab width > 0. */
  2201. tabify_output = true;
  2202. break;
  2203. @@ -984,8 +1067,8 @@ main (int argc, char **argv)
  2204. case 'n':
  2205. numbered_lines = true;
  2206. if (optarg)
  2207. - getoptarg (optarg, 'n', &number_separator,
  2208. - &chars_per_number);
  2209. + getoptarg (optarg, 'n', number_separator, &number_separator_length,
  2210. + &number_separator_width, &chars_per_number);
  2211. break;
  2212. case 'N':
  2213. skip_count = false;
  2214. @@ -1010,6 +1093,7 @@ main (int argc, char **argv)
  2215. /* Reset an additional input of -s, -S dominates -s */
  2216. col_sep_string = "";
  2217. col_sep_length = 0;
  2218. + col_sep_width = 0;
  2219. use_col_separator = true;
  2220. if (optarg)
  2221. separator_string (optarg);
  2222. @@ -1166,10 +1250,45 @@ getoptnum (const char *n_str, int min, i
  2223. a number. */
  2224. static void
  2225. -getoptarg (char *arg, char switch_char, char *character, int *number)
  2226. +getoptarg (char *arg, char switch_char, char *character, int *character_length,
  2227. + int *character_width, int *number)
  2228. {
  2229. if (!ISDIGIT (*arg))
  2230. - *character = *arg++;
  2231. + {
  2232. +#ifdef HAVE_MBRTOWC
  2233. + if (MB_CUR_MAX > 1) /* for multibyte locale. */
  2234. + {
  2235. + wchar_t wc;
  2236. + size_t mblength;
  2237. + int width;
  2238. + mbstate_t state = {'\0'};
  2239. +
  2240. + mblength = mbrtowc (&wc, arg, strnlen(arg, MB_LEN_MAX), &state);
  2241. +
  2242. + if (mblength == (size_t)-1 || mblength == (size_t)-2)
  2243. + {
  2244. + *character_length = 1;
  2245. + *character_width = 1;
  2246. + }
  2247. + else
  2248. + {
  2249. + *character_length = (mblength < 1) ? 1 : mblength;
  2250. + width = wcwidth (wc);
  2251. + *character_width = (width < 0) ? 0 : width;
  2252. + }
  2253. +
  2254. + strncpy (character, arg, *character_length);
  2255. + arg += *character_length;
  2256. + }
  2257. + else /* for single byte locale. */
  2258. +#endif
  2259. + {
  2260. + *character = *arg++;
  2261. + *character_length = 1;
  2262. + *character_width = 1;
  2263. + }
  2264. + }
  2265. +
  2266. if (*arg)
  2267. {
  2268. long int tmp_long;
  2269. @@ -1191,6 +1310,11 @@ static void
  2270. init_parameters (int number_of_files)
  2271. {
  2272. int chars_used_by_number = 0;
  2273. + int mb_len = 1;
  2274. +#if HAVE_MBRTOWC
  2275. + if (MB_CUR_MAX > 1)
  2276. + mb_len = MB_LEN_MAX;
  2277. +#endif
  2278. lines_per_body = lines_per_page - lines_per_header - lines_per_footer;
  2279. if (lines_per_body <= 0)
  2280. @@ -1228,7 +1352,7 @@ init_parameters (int number_of_files)
  2281. else
  2282. col_sep_string = column_separator;
  2283. - col_sep_length = 1;
  2284. + col_sep_length = col_sep_width = 1;
  2285. use_col_separator = true;
  2286. }
  2287. /* It's rather pointless to define a TAB separator with column
  2288. @@ -1258,11 +1382,11 @@ init_parameters (int number_of_files)
  2289. + TAB_WIDTH (chars_per_input_tab, chars_per_number); */
  2290. /* Estimate chars_per_text without any margin and keep it constant. */
  2291. - if (number_separator == '\t')
  2292. + if (number_separator[0] == '\t')
  2293. number_width = (chars_per_number
  2294. + TAB_WIDTH (chars_per_default_tab, chars_per_number));
  2295. else
  2296. - number_width = chars_per_number + 1;
  2297. + number_width = chars_per_number + number_separator_width;
  2298. /* The number is part of the column width unless we are
  2299. printing files in parallel. */
  2300. @@ -1271,7 +1395,7 @@ init_parameters (int number_of_files)
  2301. }
  2302. int sep_chars, useful_chars;
  2303. - if (INT_MULTIPLY_WRAPV (columns - 1, col_sep_length, &sep_chars))
  2304. + if (INT_MULTIPLY_WRAPV (columns - 1, col_sep_width, &sep_chars))
  2305. sep_chars = INT_MAX;
  2306. if (INT_SUBTRACT_WRAPV (chars_per_line - chars_used_by_number, sep_chars,
  2307. &useful_chars))
  2308. @@ -1294,7 +1418,7 @@ init_parameters (int number_of_files)
  2309. We've to use 8 as the lower limit, if we use chars_per_default_tab = 8
  2310. to expand a tab which is not an input_tab-char. */
  2311. free (clump_buff);
  2312. - clump_buff = xmalloc (MAX (8, chars_per_input_tab));
  2313. + clump_buff = xmalloc (mb_len * MAX (8, chars_per_input_tab));
  2314. }
  2315. /* Open the necessary files,
  2316. @@ -1402,7 +1526,7 @@ init_funcs (void)
  2317. /* Enlarge p->start_position of first column to use the same form of
  2318. padding_not_printed with all columns. */
  2319. - h = h + col_sep_length;
  2320. + h = h + col_sep_width;
  2321. /* This loop takes care of all but the rightmost column. */
  2322. @@ -1436,7 +1560,7 @@ init_funcs (void)
  2323. }
  2324. else
  2325. {
  2326. - h = h_next + col_sep_length;
  2327. + h = h_next + col_sep_width;
  2328. h_next = h + chars_per_column;
  2329. }
  2330. }
  2331. @@ -1727,9 +1851,9 @@ static void
  2332. align_column (COLUMN *p)
  2333. {
  2334. padding_not_printed = p->start_position;
  2335. - if (col_sep_length < padding_not_printed)
  2336. + if (col_sep_width < padding_not_printed)
  2337. {
  2338. - pad_across_to (padding_not_printed - col_sep_length);
  2339. + pad_across_to (padding_not_printed - col_sep_width);
  2340. padding_not_printed = ANYWHERE;
  2341. }
  2342. @@ -2004,13 +2128,13 @@ store_char (char c)
  2343. /* May be too generous. */
  2344. buff = X2REALLOC (buff, &buff_allocated);
  2345. }
  2346. - buff[buff_current++] = c;
  2347. + buff[buff_current++] = (unsigned char) c;
  2348. }
  2349. static void
  2350. add_line_number (COLUMN *p)
  2351. {
  2352. - int i;
  2353. + int i, j;
  2354. char *s;
  2355. int num_width;
  2356. @@ -2027,22 +2151,24 @@ add_line_number (COLUMN *p)
  2357. /* Tabification is assumed for multiple columns, also for n-separators,
  2358. but 'default n-separator = TAB' hasn't been given priority over
  2359. equal column_width also specified by POSIX. */
  2360. - if (number_separator == '\t')
  2361. + if (number_separator[0] == '\t')
  2362. {
  2363. i = number_width - chars_per_number;
  2364. while (i-- > 0)
  2365. (p->char_func) (' ');
  2366. }
  2367. else
  2368. - (p->char_func) (number_separator);
  2369. + for (j = 0; j < number_separator_length; j++)
  2370. + (p->char_func) (number_separator[j]);
  2371. }
  2372. else
  2373. /* To comply with POSIX, we avoid any expansion of default TAB
  2374. separator with a single column output. No column_width requirement
  2375. has to be considered. */
  2376. {
  2377. - (p->char_func) (number_separator);
  2378. - if (number_separator == '\t')
  2379. + for (j = 0; j < number_separator_length; j++)
  2380. + (p->char_func) (number_separator[j]);
  2381. + if (number_separator[0] == '\t')
  2382. output_position = POS_AFTER_TAB (chars_per_output_tab,
  2383. output_position);
  2384. }
  2385. @@ -2203,7 +2329,7 @@ print_white_space (void)
  2386. while (goal - h_old > 1
  2387. && (h_new = POS_AFTER_TAB (chars_per_output_tab, h_old)) <= goal)
  2388. {
  2389. - putchar (output_tab_char);
  2390. + fwrite (output_tab_char, sizeof(char), output_tab_char_length, stdout);
  2391. h_old = h_new;
  2392. }
  2393. while (++h_old <= goal)
  2394. @@ -2223,6 +2349,7 @@ print_sep_string (void)
  2395. {
  2396. char const *s = col_sep_string;
  2397. int l = col_sep_length;
  2398. + int not_space_flag;
  2399. if (separators_not_printed <= 0)
  2400. {
  2401. @@ -2234,6 +2361,7 @@ print_sep_string (void)
  2402. {
  2403. for (; separators_not_printed > 0; --separators_not_printed)
  2404. {
  2405. + not_space_flag = 0;
  2406. while (l-- > 0)
  2407. {
  2408. /* 3 types of sep_strings: spaces only, spaces and chars,
  2409. @@ -2247,12 +2375,15 @@ print_sep_string (void)
  2410. }
  2411. else
  2412. {
  2413. + not_space_flag = 1;
  2414. if (spaces_not_printed > 0)
  2415. print_white_space ();
  2416. putchar (*s++);
  2417. - ++output_position;
  2418. }
  2419. }
  2420. + if (not_space_flag)
  2421. + output_position += col_sep_width;
  2422. +
  2423. /* sep_string ends with some spaces */
  2424. if (spaces_not_printed > 0)
  2425. print_white_space ();
  2426. @@ -2280,7 +2411,7 @@ print_clump (COLUMN *p, int n, char *clu
  2427. required number of tabs and spaces. */
  2428. static void
  2429. -print_char (char c)
  2430. +print_char_single (char c)
  2431. {
  2432. if (tabify_output)
  2433. {
  2434. @@ -2304,6 +2435,74 @@ print_char (char c)
  2435. putchar (c);
  2436. }
  2437. +#ifdef HAVE_MBRTOWC
  2438. +static void
  2439. +print_char_multi (char c)
  2440. +{
  2441. + static size_t mbc_pos = 0;
  2442. + static char mbc[MB_LEN_MAX] = {'\0'};
  2443. + static mbstate_t state = {'\0'};
  2444. + mbstate_t state_bak;
  2445. + wchar_t wc;
  2446. + size_t mblength;
  2447. + int width;
  2448. +
  2449. + if (tabify_output)
  2450. + {
  2451. + state_bak = state;
  2452. + mbc[mbc_pos++] = c; /* Accumulate bytes in MBC until they decode. */
  2453. + mblength = mbrtowc (&wc, mbc, mbc_pos, &state);
  2454. +
  2455. + while (mbc_pos > 0)
  2456. + {
  2457. + switch (mblength)
  2458. + {
  2459. + case (size_t)-2: /* Incomplete character: keep the bytes, wait for more. */
  2460. + state = state_bak;
  2461. + return;
  2462. +
  2463. + case (size_t)-1: /* Invalid sequence: emit its first byte as is. */
  2464. + state = state_bak;
  2465. + ++output_position;
  2466. + putchar (mbc[0]);
  2467. + memmove (mbc, mbc + 1, MB_CUR_MAX - 1);
  2468. + --mbc_pos;
  2469. + break;
  2470. +
  2471. + case 0:
  2472. + mblength = 1; /* A NUL occupies one byte. Fall through. */
  2473. +
  2474. + default:
  2475. + if (wc == L' ')
  2476. + {
  2477. + memmove (mbc, mbc + mblength, MB_CUR_MAX - mblength);
  2478. + --mbc_pos;
  2479. + ++spaces_not_printed;
  2480. + return;
  2481. + }
  2482. + else if (spaces_not_printed > 0)
  2483. + print_white_space ();
  2484. +
  2485. + /* Nonprintables are assumed to have width 0, except L'\b'. */
  2486. + if ((width = wcwidth (wc)) < 1)
  2487. + {
  2488. + if (wc == L'\b')
  2489. + --output_position;
  2490. + }
  2491. + else
  2492. + output_position += width;
  2493. +
  2494. + fwrite (mbc, sizeof(char), mblength, stdout);
  2495. + memmove (mbc, mbc + mblength, MB_CUR_MAX - mblength);
  2496. + mbc_pos -= mblength;
  2497. + }
  2498. + }
  2499. + return;
  2500. + }
  2501. + putchar (c); /* Output is not being tabified: pass the byte through. */
  2502. +}
  2503. +#endif
  2504. +
  2505. /* Skip to page PAGE before printing.
  2506. PAGE may be larger than total number of pages. */
  2507. @@ -2483,9 +2682,9 @@ read_line (COLUMN *p)
  2508. align_empty_cols = false;
  2509. }
  2510. - if (col_sep_length < padding_not_printed)
  2511. + if (col_sep_width < padding_not_printed)
  2512. {
  2513. - pad_across_to (padding_not_printed - col_sep_length);
  2514. + pad_across_to (padding_not_printed - col_sep_width);
  2515. padding_not_printed = ANYWHERE;
  2516. }
  2517. @@ -2555,7 +2754,7 @@ print_stored (COLUMN *p)
  2518. int i;
  2519. int line = p->current_line++;
  2520. - char *first = &buff[line_vector[line]];
  2521. + unsigned char *first = &buff[line_vector[line]];
  2522. /* FIXME
  2523. UMR: Uninitialized memory read:
  2524. * This is occurring while in:
  2525. @@ -2567,7 +2766,7 @@ print_stored (COLUMN *p)
  2526. xmalloc [xmalloc.c:94]
  2527. init_store_cols [pr.c:1648]
  2528. */
  2529. - char *last = &buff[line_vector[line + 1]];
  2530. + unsigned char *last = &buff[line_vector[line + 1]];
  2531. pad_vertically = true;
  2532. @@ -2586,9 +2785,9 @@ print_stored (COLUMN *p)
  2533. }
  2534. }
  2535. - if (col_sep_length < padding_not_printed)
  2536. + if (col_sep_width < padding_not_printed)
  2537. {
  2538. - pad_across_to (padding_not_printed - col_sep_length);
  2539. + pad_across_to (padding_not_printed - col_sep_width);
  2540. padding_not_printed = ANYWHERE;
  2541. }
  2542. @@ -2601,8 +2800,8 @@ print_stored (COLUMN *p)
  2543. if (spaces_not_printed == 0)
  2544. {
  2545. output_position = p->start_position + end_vector[line];
  2546. - if (p->start_position - col_sep_length == chars_per_margin)
  2547. - output_position -= col_sep_length;
  2548. + if (p->start_position - col_sep_width == chars_per_margin)
  2549. + output_position -= col_sep_width;
  2550. }
  2551. return true;
  2552. @@ -2621,7 +2820,7 @@ print_stored (COLUMN *p)
  2553. number of characters is 1.) */
  2554. static int
  2555. -char_to_clump (char c)
  2556. +char_to_clump_single (char c)
  2557. {
  2558. unsigned char uc = c;
  2559. char *s = clump_buff;
  2560. @@ -2631,10 +2830,10 @@ char_to_clump (char c)
  2561. int chars;
  2562. int chars_per_c = 8;
  2563. - if (c == input_tab_char)
  2564. + if (c == input_tab_char[0])
  2565. chars_per_c = chars_per_input_tab;
  2566. - if (c == input_tab_char || c == '\t')
  2567. + if (c == input_tab_char[0] || c == '\t')
  2568. {
  2569. width = TAB_WIDTH (chars_per_c, input_position);
  2570. @@ -2715,6 +2914,164 @@ char_to_clump (char c)
  2571. return chars;
  2572. }
  2573. +#ifdef HAVE_MBRTOWC
  2574. +static int
  2575. +char_to_clump_multi (char c)
  2576. +{
  2577. + static size_t mbc_pos = 0;
  2578. + static char mbc[MB_LEN_MAX] = {'\0'};
  2579. + static mbstate_t state = {'\0'};
  2580. + mbstate_t state_bak;
  2581. + wchar_t wc;
  2582. + size_t mblength;
  2583. + int wc_width;
  2584. + register char *s = clump_buff;
  2585. + register int i, j;
  2586. + char esc_buff[4];
  2587. + int width;
  2588. + int chars;
  2589. + int chars_per_c = 8;
  2590. +
  2591. + state_bak = state;
  2592. + mbc[mbc_pos++] = c; /* Accumulate bytes in MBC until they decode. */
  2593. + mblength = mbrtowc (&wc, mbc, mbc_pos, &state);
  2594. +
  2595. + width = 0;
  2596. + chars = 0;
  2597. + while (mbc_pos > 0)
  2598. + {
  2599. + switch (mblength)
  2600. + {
  2601. + case (size_t)-2: /* Incomplete character: keep the bytes, wait for more. */
  2602. + state = state_bak;
  2603. + return 0;
  2604. +
  2605. + case (size_t)-1: /* Invalid sequence: emit its first byte on its own. */
  2606. + state = state_bak;
  2607. + mblength = 1; /* NOTE(review): mbrtowc is not re-run for bytes left in MBC, so a further iteration reaches 'default' with WC never set -- confirm. */
  2608. +
  2609. + if (use_esc_sequence || use_cntrl_prefix)
  2610. + {
  2611. + width += 4;
  2612. + chars += 4;
  2613. + *s++ = '\\';
  2614. + sprintf (esc_buff, "%03o", (unsigned char) mbc[0]);
  2615. + for (i = 0; i <= 2; ++i)
  2616. + *s++ = (int) esc_buff[i];
  2617. + }
  2618. + else
  2619. + {
  2620. + width += 1;
  2621. + chars += 1;
  2622. + *s++ = mbc[0];
  2623. + }
  2624. + break;
  2625. +
  2626. + case 0:
  2627. + mblength = 1;
  2628. + /* Fall through */
  2629. +
  2630. + default:
  2631. + if (memcmp (mbc, input_tab_char, mblength) == 0)
  2632. + chars_per_c = chars_per_input_tab;
  2633. +
  2634. + if (memcmp (mbc, input_tab_char, mblength) == 0 || c == '\t')
  2635. + {
  2636. + int width_inc;
  2637. +
  2638. + width_inc = TAB_WIDTH (chars_per_c, input_position);
  2639. + width += width_inc;
  2640. +
  2641. + if (untabify_input)
  2642. + {
  2643. + for (i = width_inc; i; --i)
  2644. + *s++ = ' ';
  2645. + chars += width_inc;
  2646. + }
  2647. + else
  2648. + {
  2649. + for (i = 0; i < mblength; i++)
  2650. + *s++ = mbc[i];
  2651. + chars += mblength;
  2652. + }
  2653. + }
  2654. + else if ((wc_width = wcwidth (wc)) < 1)
  2655. + {
  2656. + if (use_esc_sequence)
  2657. + {
  2658. + for (i = 0; i < mblength; i++)
  2659. + {
  2660. + width += 4;
  2661. + chars += 4;
  2662. + *s++ = '\\';
  2663. + sprintf (esc_buff, "%03o", (unsigned char) mbc[i]);
  2664. + for (j = 0; j <= 2; ++j)
  2665. + *s++ = (int) esc_buff[j];
  2666. + }
  2667. + }
  2668. + else if (use_cntrl_prefix)
  2669. + {
  2670. + if (wc < 0200)
  2671. + {
  2672. + width += 2;
  2673. + chars += 2;
  2674. + *s++ = '^';
  2675. + *s++ = wc ^ 0100;
  2676. + }
  2677. + else
  2678. + {
  2679. + for (i = 0; i < mblength; i++)
  2680. + {
  2681. + width += 4;
  2682. + chars += 4;
  2683. + *s++ = '\\';
  2684. + sprintf (esc_buff, "%03o", (unsigned char) mbc[i]);
  2685. + for (j = 0; j <= 2; ++j)
  2686. + *s++ = (int) esc_buff[j];
  2687. + }
  2688. + }
  2689. + }
  2690. + else if (wc == L'\b')
  2691. + {
  2692. + width += -1;
  2693. + chars += 1;
  2694. + *s++ = c;
  2695. + }
  2696. + else
  2697. + {
  2698. + width += 0;
  2699. + chars += mblength;
  2700. + for (i = 0; i < mblength; i++)
  2701. + *s++ = mbc[i];
  2702. + }
  2703. + }
  2704. + else
  2705. + {
  2706. + width += wc_width;
  2707. + chars += mblength;
  2708. + for (i = 0; i < mblength; i++)
  2709. + *s++ = mbc[i];
  2710. + }
  2711. + }
  2712. + memmove (mbc, mbc + mblength, MB_CUR_MAX - mblength);
  2713. + mbc_pos -= mblength;
  2714. + }
  2715. +
  2716. + /* Too many backspaces must put us in position 0 -- never negative. */
  2717. + if (width < 0 && input_position == 0)
  2718. + {
  2719. + chars = 0;
  2720. + input_position = 0;
  2721. + }
  2722. + else if (width < 0 && input_position <= -width)
  2723. + input_position = 0;
  2724. + else
  2725. + input_position += width;
  2726. +
  2727. + return chars;
  2728. +}
  2729. +#endif
  2730. +
  2731. /* We've just printed some files and need to clean up things before
  2732. looking for more options and printing the next batch of files.
  2733. diff -Naurp coreutils-8.27-orig/src/sort.c coreutils-8.27/src/sort.c
  2734. --- coreutils-8.27-orig/src/sort.c 2017-01-01 16:34:24.000000000 -0600
  2735. +++ coreutils-8.27/src/sort.c 2017-03-11 23:49:22.416505389 -0600
  2736. @@ -29,6 +29,14 @@
  2737. #include <sys/wait.h>
  2738. #include <signal.h>
  2739. #include <assert.h>
  2740. +#if HAVE_WCHAR_H
  2741. +# include <wchar.h>
  2742. +#endif
  2743. +/* Get isw* functions. */
  2744. +#if HAVE_WCTYPE_H
  2745. +# include <wctype.h>
  2746. +#endif
  2747. +
  2748. #include "system.h"
  2749. #include "argmatch.h"
  2750. #include "die.h"
  2751. @@ -165,14 +173,39 @@ static int decimal_point;
  2752. /* Thousands separator; if -1, then there isn't one. */
  2753. static int thousands_sep;
  2754. +/* True if -f is specified. */
  2755. +static bool folding;
  2756. +
  2757. /* Nonzero if the corresponding locales are hard. */
  2758. static bool hard_LC_COLLATE;
  2759. -#if HAVE_NL_LANGINFO
  2760. +#if HAVE_LANGINFO_CODESET
  2761. static bool hard_LC_TIME;
  2762. #endif
  2763. #define NONZERO(x) ((x) != 0)
  2764. +/* Store in MBLENGTH the byte length of the multibyte character at PTR, examining at most LIM - PTR bytes; an invalid, incomplete or NUL character counts as 1 byte, and STATE is restored after a decoding error. */
  2765. +#define GET_BYTELEN_OF_CHAR(LIM, PTR, MBLENGTH, STATE) \
  2766. + do \
  2767. + { \
  2768. + wchar_t wc; \
  2769. + mbstate_t state_bak; \
  2770. + \
  2771. + state_bak = (STATE); \
  2772. + (MBLENGTH) = mbrtowc (&wc, (PTR), (LIM) - (PTR), &(STATE)); \
  2773. + \
  2774. + switch (MBLENGTH) \
  2775. + { \
  2776. + case (size_t)-1: \
  2777. + case (size_t)-2: \
  2778. + (STATE) = state_bak; \
  2779. + /* Fall through. */ \
  2780. + case 0: \
  2781. + (MBLENGTH) = 1; \
  2782. + } \
  2783. + } \
  2784. + while (0)
  2785. +
  2786. /* The kind of blanks for '-b' to skip in various options. */
  2787. enum blanktype { bl_start, bl_end, bl_both };
  2788. @@ -346,13 +379,11 @@ static bool reverse;
  2789. they were read if all keys compare equal. */
  2790. static bool stable;
  2791. -/* If TAB has this value, blanks separate fields. */
  2792. -enum { TAB_DEFAULT = CHAR_MAX + 1 };
  2793. -
  2794. -/* Tab character separating fields. If TAB_DEFAULT, then fields are
  2795. +/* Tab character separating fields. If tab_length is 0, then fields are
  2796. separated by the empty string between a non-blank character and a blank
  2797. character. */
  2798. -static int tab = TAB_DEFAULT;
  2799. +static char tab[MB_LEN_MAX + 1];
  2800. +static size_t tab_length = 0;
  2801. /* Flag to remove consecutive duplicate lines from the output.
  2802. Only the last of a sequence of equal lines will be output. */
  2803. @@ -811,6 +842,46 @@ reap_all (void)
  2804. reap (-1);
  2805. }
  2806. +/* Function pointers. */
  2807. +static void
  2808. +(*inittables) (void);
  2809. +static char *
  2810. +(*begfield) (const struct line*, const struct keyfield *);
  2811. +static char *
  2812. +(*limfield) (const struct line*, const struct keyfield *);
  2813. +static void
  2814. +(*skipblanks) (char **ptr, char *lim);
  2815. +static int
  2816. +(*getmonth) (char const *, size_t, char **);
  2817. +static int
  2818. +(*keycompare) (const struct line *, const struct line *);
  2819. +static int
  2820. +(*numcompare) (const char *, const char *);
  2821. +
  2822. +/* Test whether STR starts with a blank (or newline) multibyte character.
  2823. + Set *LENGTH to the byte length of the investigated multibyte character. */
  2824. +#if HAVE_MBRTOWC
  2825. +static int
  2826. +ismbblank (const char *str, size_t len, size_t *length)
  2827. +{
  2828. + size_t mblength;
  2829. + wchar_t wc;
  2830. + mbstate_t state;
  2831. +
  2832. + memset (&state, '\0', sizeof(mbstate_t));
  2833. + mblength = mbrtowc (&wc, str, len, &state);
  2834. +
  2835. + if (mblength == (size_t)-1 || mblength == (size_t)-2)
  2836. + {
  2837. + *length = 1;
  2838. + return 0;
  2839. + }
  2840. +
  2841. + *length = (mblength < 1) ? 1 : mblength;
  2842. + return iswblank (wc) || wc == '\n';
  2843. +}
  2844. +#endif
  2845. +
  2846. /* Clean up any remaining temporary files. */
  2847. static void
  2848. @@ -1255,7 +1326,7 @@ zaptemp (char const *name)
  2849. free (node);
  2850. }
  2851. -#if HAVE_NL_LANGINFO
  2852. +#if HAVE_LANGINFO_CODESET
  2853. static int
  2854. struct_month_cmp (void const *m1, void const *m2)
  2855. @@ -1270,7 +1341,7 @@ struct_month_cmp (void const *m1, void c
  2856. /* Initialize the character class tables. */
  2857. static void
  2858. -inittables (void)
  2859. +inittables_uni (void)
  2860. {
  2861. size_t i;
  2862. @@ -1282,7 +1353,7 @@ inittables (void)
  2863. fold_toupper[i] = toupper (i);
  2864. }
  2865. -#if HAVE_NL_LANGINFO
  2866. +#if HAVE_LANGINFO_CODESET
  2867. /* If we're not in the "C" locale, read different names for months. */
  2868. if (hard_LC_TIME)
  2869. {
  2870. @@ -1364,6 +1435,84 @@ specify_nmerge (int oi, char c, char con
  2871. xstrtol_fatal (e, oi, c, long_options, s);
  2872. }
  2873. +#if HAVE_MBRTOWC
  2874. +static void
  2875. +inittables_mb (void)
  2876. +{
  2877. + int i, j, k, l;
  2878. + char *name, *s, *lc_time, *lc_ctype;
  2879. + size_t s_len, mblength;
  2880. + char mbc[MB_LEN_MAX];
  2881. + wchar_t wc, pwc;
  2882. + mbstate_t state_mb, state_wc;
  2883. +
  2884. + lc_time = setlocale (LC_TIME, "");
  2885. + if (lc_time)
  2886. + lc_time = xstrdup (lc_time);
  2887. +
  2888. + lc_ctype = setlocale (LC_CTYPE, "");
  2889. + if (lc_ctype)
  2890. + lc_ctype = xstrdup (lc_ctype);
  2891. +
  2892. + if (lc_time && lc_ctype)
  2893. + /* temporarily set LC_CTYPE to match LC_TIME, so that we can convert
  2894. + * the names of months to upper case */
  2895. + setlocale (LC_CTYPE, lc_time);
  2896. +
  2897. + for (i = 0; i < MONTHS_PER_YEAR; i++)
  2898. + {
  2899. + s = (char *) nl_langinfo (ABMON_1 + i);
  2900. + s_len = strlen (s);
  2901. + monthtab[i].name = name = (char *) xmalloc (s_len + 1);
  2902. + monthtab[i].val = i + 1;
  2903. +
  2904. + memset (&state_mb, '\0', sizeof (mbstate_t));
  2905. + memset (&state_wc, '\0', sizeof (mbstate_t));
  2906. +
  2907. + for (j = 0; j < s_len;)
  2908. + {
  2909. + if (!ismbblank (s + j, s_len - j, &mblength))
  2910. + break;
  2911. + j += mblength;
  2912. + }
  2913. +
  2914. + for (k = 0; j < s_len;)
  2915. + {
  2916. + mblength = mbrtowc (&wc, (s + j), (s_len - j), &state_mb);
  2917. + assert (mblength != (size_t)-1 && mblength != (size_t)-2);
  2918. + if (mblength == 0)
  2919. + break;
  2920. +
  2921. + pwc = towupper (wc);
  2922. + if (pwc == wc)
  2923. + {
  2924. + memcpy (mbc, s + j, mblength);
  2925. + j += mblength;
  2926. + }
  2927. + else
  2928. + {
  2929. + j += mblength;
  2930. + mblength = wcrtomb (mbc, pwc, &state_wc);
  2931. + assert (mblength != (size_t)0 && mblength != (size_t)-1);
  2932. + }
  2933. +
  2934. + for (l = 0; l < mblength; l++)
  2935. + name[k++] = mbc[l];
  2936. + }
  2937. + name[k] = '\0';
  2938. + }
  2939. + qsort ((void *) monthtab, MONTHS_PER_YEAR,
  2940. + sizeof (struct month), struct_month_cmp);
  2941. +
  2942. + if (lc_time && lc_ctype)
  2943. + /* restore the original locales */
  2944. + setlocale (LC_CTYPE, lc_ctype);
  2945. +
  2946. + free (lc_ctype);
  2947. + free (lc_time);
  2948. +}
  2949. +#endif
  2950. +
  2951. /* Specify the amount of main memory to use when sorting. */
  2952. static void
  2953. specify_sort_size (int oi, char c, char const *s)
  2954. @@ -1597,7 +1746,7 @@ buffer_linelim (struct buffer const *buf
  2955. by KEY in LINE. */
  2956. static char *
  2957. -begfield (struct line const *line, struct keyfield const *key)
  2958. +begfield_uni (const struct line *line, const struct keyfield *key)
  2959. {
  2960. char *ptr = line->text, *lim = ptr + line->length - 1;
  2961. size_t sword = key->sword;
  2962. @@ -1606,10 +1755,10 @@ begfield (struct line const *line, struc
  2963. /* The leading field separator itself is included in a field when -t
  2964. is absent. */
  2965. - if (tab != TAB_DEFAULT)
  2966. + if (tab_length)
  2967. while (ptr < lim && sword--)
  2968. {
  2969. - while (ptr < lim && *ptr != tab)
  2970. + while (ptr < lim && *ptr != tab[0])
  2971. ++ptr;
  2972. if (ptr < lim)
  2973. ++ptr;
  2974. @@ -1635,11 +1784,70 @@ begfield (struct line const *line, struc
  2975. return ptr;
  2976. }
  2977. +#if HAVE_MBRTOWC
  2978. +static char *
  2979. +begfield_mb (const struct line *line, const struct keyfield *key)
  2980. +{ /* Multibyte counterpart of begfield_uni: return the start of KEY's field in LINE.  */
  2981. + size_t i; /* Unsigned, so it compares cleanly against SCHAR below.  */
  2982. + char *ptr = line->text, *lim = ptr + line->length - 1;
  2983. + size_t sword = key->sword;
  2984. + size_t schar = key->schar;
  2985. + size_t mblength;
  2986. + mbstate_t state;
  2987. +
  2988. + memset (&state, '\0', sizeof(mbstate_t));
  2989. +
  2990. + if (tab_length) /* A separator is set: skip SWORD fields delimited by TAB.  */
  2991. + while (ptr < lim && sword--)
  2992. + {
  2993. + while (ptr < lim && memcmp (ptr, tab, tab_length) != 0)
  2994. + {
  2995. + GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state);
  2996. + ptr += mblength;
  2997. + }
  2998. + if (ptr < lim) /* Step over the separator that ended the field.  */
  2999. + {
  3000. + GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state);
  3001. + ptr += mblength;
  3002. + }
  3003. + }
  3004. + else /* No separator: skip SWORD fields delimited by blanks.  */
  3005. + while (ptr < lim && sword--)
  3006. + {
  3007. + while (ptr < lim && ismbblank (ptr, lim - ptr, &mblength))
  3008. + ptr += mblength;
  3009. + if (ptr < lim)
  3010. + {
  3011. + GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state);
  3012. + ptr += mblength;
  3013. + }
  3014. + while (ptr < lim && !ismbblank (ptr, lim - ptr, &mblength))
  3015. + ptr += mblength;
  3016. + }
  3017. +
  3018. + if (key->skipsblanks) /* Leading blanks are not part of the key.  */
  3019. + while (ptr < lim && ismbblank (ptr, lim - ptr, &mblength))
  3020. + ptr += mblength;
  3021. +
  3022. + for (i = 0; i < schar; i++) /* Advance SCHAR characters, not bytes.  */
  3023. + {
  3024. + GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state);
  3025. +
  3026. + if (ptr + mblength > lim) /* Never step beyond LIM.  */
  3027. + break;
  3028. + else
  3029. + ptr += mblength;
  3030. + }
  3031. +
  3032. + return ptr;
  3033. +}
  3034. +#endif
  3035. +
  3036. /* Return the limit of (a pointer to the first character after) the field
  3037. in LINE specified by KEY. */
  3038. static char *
  3039. -limfield (struct line const *line, struct keyfield const *key)
  3040. +limfield_uni (const struct line *line, const struct keyfield *key)
  3041. {
  3042. char *ptr = line->text, *lim = ptr + line->length - 1;
  3043. size_t eword = key->eword, echar = key->echar;
  3044. @@ -1654,10 +1862,10 @@ limfield (struct line const *line, struc
  3045. 'beginning' is the first character following the delimiting TAB.
  3046. Otherwise, leave PTR pointing at the first 'blank' character after
  3047. the preceding field. */
  3048. - if (tab != TAB_DEFAULT)
  3049. + if (tab_length)
  3050. while (ptr < lim && eword--)
  3051. {
  3052. - while (ptr < lim && *ptr != tab)
  3053. + while (ptr < lim && *ptr != tab[0])
  3054. ++ptr;
  3055. if (ptr < lim && (eword || echar))
  3056. ++ptr;
  3057. @@ -1703,10 +1911,10 @@ limfield (struct line const *line, struc
  3058. */
  3059. /* Make LIM point to the end of (one byte past) the current field. */
  3060. - if (tab != TAB_DEFAULT)
  3061. + if (tab_length)
  3062. {
  3063. char *newlim;
  3064. - newlim = memchr (ptr, tab, lim - ptr);
  3065. + newlim = memchr (ptr, tab[0], lim - ptr);
  3066. if (newlim)
  3067. lim = newlim;
  3068. }
  3069. @@ -1737,6 +1945,130 @@ limfield (struct line const *line, struc
  3070. return ptr;
  3071. }
  3072. +#if HAVE_MBRTOWC
  3073. +static char *
  3074. +limfield_mb (const struct line *line, const struct keyfield *key)
  3075. +{ /* Multibyte counterpart of limfield_uni: return the limit of KEY's field in LINE.  */
  3076. + char *ptr = line->text, *lim = ptr + line->length - 1;
  3077. + size_t eword = key->eword, echar = key->echar;
  3078. + size_t i; /* Unsigned, so it compares cleanly against ECHAR below.  */
  3079. + size_t mblength;
  3080. + mbstate_t state;
  3081. +
  3082. + if (echar == 0)
  3083. + eword++; /* skip all of end field. */
  3084. +
  3085. + memset (&state, '\0', sizeof(mbstate_t));
  3086. +
  3087. + if (tab_length) /* A separator is set: skip EWORD fields delimited by TAB.  */
  3088. + while (ptr < lim && eword--)
  3089. + {
  3090. + while (ptr < lim && memcmp (ptr, tab, tab_length) != 0)
  3091. + {
  3092. + GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state);
  3093. + ptr += mblength;
  3094. + }
  3095. + if (ptr < lim && (eword | echar)) /* Keep the last separator unless more is wanted.  */
  3096. + {
  3097. + GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state);
  3098. + ptr += mblength;
  3099. + }
  3100. + }
  3101. + else /* No separator: skip EWORD fields delimited by blanks.  */
  3102. + while (ptr < lim && eword--)
  3103. + {
  3104. + while (ptr < lim && ismbblank (ptr, lim - ptr, &mblength))
  3105. + ptr += mblength;
  3106. + if (ptr < lim)
  3107. + {
  3108. + GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state);
  3109. + ptr += mblength;
  3110. + }
  3111. + while (ptr < lim && !ismbblank (ptr, lim - ptr, &mblength))
  3112. + ptr += mblength;
  3113. + }
  3114. +
  3115. +
  3116. +# ifdef POSIX_UNSPECIFIED
  3117. + /* Make LIM point to the end of (one byte past) the current field. */
  3118. + if (tab_length)
  3119. + {
  3120. + char *p;
  3121. +
  3122. + /* Narrow LIM to the first separator at or after PTR, if there is one. */
  3123. + for (p = ptr; p < lim;)
  3124. + {
  3125. + if (memcmp (p, tab, tab_length) == 0)
  3126. + {
  3127. + lim = p;
  3128. + break;
  3129. + }
  3130. +
  3131. + GET_BYTELEN_OF_CHAR (lim, p, mblength, state);
  3132. + p += mblength;
  3133. + }
  3134. + }
  3135. + else
  3136. + {
  3137. + char *newlim;
  3138. + newlim = ptr; /* Scan with NEWLIM so that PTR itself stays put.  */
  3139. +
  3140. + while (newlim < lim && ismbblank (newlim, lim - newlim, &mblength))
  3141. + newlim += mblength;
  3142. + if (newlim < lim)
  3143. + {
  3144. + GET_BYTELEN_OF_CHAR (lim, newlim, mblength, state);
  3145. + newlim += mblength;
  3146. + }
  3147. + while (newlim < lim && !ismbblank (newlim, lim - newlim, &mblength))
  3148. + newlim += mblength;
  3149. + lim = newlim;
  3150. + }
  3151. +# endif
  3152. +
  3153. + if (echar != 0)
  3154. + {
  3155. + /* If we're skipping leading blanks, don't start counting characters
  3156. + * until after skipping past any leading blanks. */
  3157. + if (key->skipeblanks)
  3158. + while (ptr < lim && ismbblank (ptr, lim - ptr, &mblength))
  3159. + ptr += mblength;
  3160. +
  3161. + memset (&state, '\0', sizeof(mbstate_t));
  3162. +
  3163. + /* Advance PTR by ECHAR (if possible), but no further than LIM. */
  3164. + for (i = 0; i < echar; i++)
  3165. + {
  3166. + GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state);
  3167. +
  3168. + if (ptr + mblength > lim)
  3169. + break;
  3170. + else
  3171. + ptr += mblength;
  3172. + }
  3173. + }
  3174. +
  3175. + return ptr;
  3176. +}
  3177. +#endif
  3178. +
  3179. +static void
  3180. +skipblanks_uni (char **ptr, char *lim)
  3181. +{ /* Advance *PTR past single-byte blanks, stopping at LIM.  */
  3182. + for (; *ptr < lim && blanks[to_uchar (**ptr)]; ++(*ptr))
  3183. + continue;
  3184. +}
  3185. +
  3186. +#if HAVE_MBRTOWC
  3187. +static void
  3188. +skipblanks_mb (char **ptr, char *lim)
  3189. +{ /* Advance *PTR past multibyte blanks, stopping at LIM.  */
  3190. + size_t nbytes;
  3191. + for (; *ptr < lim && ismbblank (*ptr, lim - *ptr, &nbytes); *ptr += nbytes)
  3192. + continue;
  3193. +}
  3194. +#endif
  3195. +
  3196. /* Fill BUF reading from FP, moving buf->left bytes from the end
  3197. of buf->buf to the beginning first. If EOF is reached and the
  3198. file wasn't terminated by a newline, supply one. Set up BUF's line
  3199. @@ -1823,8 +2155,22 @@ fillbuf (struct buffer *buf, FILE *fp, c
  3200. else
  3201. {
  3202. if (key->skipsblanks)
  3203. - while (blanks[to_uchar (*line_start)])
  3204. - line_start++;
  3205. + {
  3206. +#if HAVE_MBRTOWC
  3207. + if (MB_CUR_MAX > 1)
  3208. + {
  3209. + size_t mblength;
  3210. + while (line_start < line->keylim &&
  3211. + ismbblank (line_start,
  3212. + line->keylim - line_start,
  3213. + &mblength))
  3214. + line_start += mblength;
  3215. + }
  3216. + else
  3217. +#endif
  3218. + while (blanks[to_uchar (*line_start)])
  3219. + line_start++;
  3220. + }
  3221. line->keybeg = line_start;
  3222. }
  3223. }
  3224. @@ -1958,12 +2304,10 @@ find_unit_order (char const *number)
  3225. <none/unknown> < K/k < M < G < T < P < E < Z < Y */
  3226. static int
  3227. -human_numcompare (char const *a, char const *b)
  3228. +human_numcompare (char *a, char *b)
  3229. {
  3230. - while (blanks[to_uchar (*a)])
  3231. - a++;
  3232. - while (blanks[to_uchar (*b)])
  3233. - b++;
  3234. + skipblanks(&a, a + strlen(a));
  3235. + skipblanks(&b, b + strlen(b));
  3236. int diff = find_unit_order (a) - find_unit_order (b);
  3237. return (diff ? diff : strnumcmp (a, b, decimal_point, thousands_sep));
  3238. @@ -1974,7 +2318,7 @@ human_numcompare (char const *a, char co
  3239. hideously fast. */
  3240. static int
  3241. -numcompare (char const *a, char const *b)
  3242. +numcompare_uni (const char *a, const char *b)
  3243. {
  3244. while (blanks[to_uchar (*a)])
  3245. a++;
  3246. @@ -1984,6 +2328,25 @@ numcompare (char const *a, char const *b
  3247. return strnumcmp (a, b, decimal_point, thousands_sep);
  3248. }
  3249. +#if HAVE_MBRTOWC
  3250. +static int
  3251. +numcompare_mb (const char *a, const char *b)
  3252. +{ /* Multibyte counterpart of numcompare_uni: skip leading blanks, then compare.  */
  3253. + size_t mblength, len; /* LEN tracks the bytes left in the string being scanned.  */
  3254. + len = strlen (a); /* okay for UTF-8 */
  3255. + while (*a && ismbblank (a, len > MB_CUR_MAX ? MB_CUR_MAX : len, &mblength))
  3256. + {
  3257. + a += mblength;
  3258. + len -= mblength;
  3259. + }
  3260. + len = strlen (b); /* okay for UTF-8 */
  3261. + while (*b && ismbblank (b, len > MB_CUR_MAX ? MB_CUR_MAX : len, &mblength))
  3262. + { b += mblength; len -= mblength; } /* Keep LEN in step with B, as for A.  */
  3263. +
  3264. + return strnumcmp (a, b, decimal_point, thousands_sep);
  3265. +}
  3266. +#endif /* HAVE_MBRTOWC */
  3267. +
  3268. /* Work around a problem whereby the long double value returned by glibc's
  3269. strtold ("NaN", ...) contains uninitialized bits: clear all bytes of
  3270. A and B before calling strtold. FIXME: remove this function once
  3271. @@ -2034,7 +2397,7 @@ general_numcompare (char const *sa, char
  3272. Return 0 if the name in S is not recognized. */
  3273. static int
  3274. -getmonth (char const *month, char **ea)
  3275. +getmonth_uni (char const *month, size_t len, char **ea)
  3276. {
  3277. size_t lo = 0;
  3278. size_t hi = MONTHS_PER_YEAR;
  3279. @@ -2310,15 +2673,14 @@ debug_key (struct line const *line, stru
  3280. char saved = *lim;
  3281. *lim = '\0';
  3282. - while (blanks[to_uchar (*beg)])
  3283. - beg++;
  3284. + skipblanks (&beg, lim);
  3285. char *tighter_lim = beg;
  3286. if (lim < beg)
  3287. tighter_lim = lim;
  3288. else if (key->month)
  3289. - getmonth (beg, &tighter_lim);
  3290. + getmonth (beg, lim-beg, &tighter_lim);
  3291. else if (key->general_numeric)
  3292. ignore_value (strtold (beg, &tighter_lim));
  3293. else if (key->numeric || key->human_numeric)
  3294. @@ -2452,7 +2814,7 @@ key_warnings (struct keyfield const *gke
  3295. /* Warn about significant leading blanks. */
  3296. bool implicit_skip = key_numeric (key) || key->month;
  3297. bool line_offset = key->eword == 0 && key->echar != 0; /* -k1.x,1.y */
  3298. - if (!zero_width && !gkey_only && tab == TAB_DEFAULT && !line_offset
  3299. + if (!zero_width && !gkey_only && !tab_length && !line_offset
  3300. && ((!key->skipsblanks && !implicit_skip)
  3301. || (!key->skipsblanks && key->schar)
  3302. || (!key->skipeblanks && key->echar)))
  3303. @@ -2510,11 +2872,87 @@ key_warnings (struct keyfield const *gke
  3304. error (0, 0, _("option '-r' only applies to last-resort comparison"));
  3305. }
  3306. +#if HAVE_MBRTOWC
  3307. +static int
  3308. +getmonth_mb (const char *s, size_t len, char **ea)
  3309. +{ /* Return the monthtab value of the month name starting S (LEN bytes), or 0 if none matches.  */
  3310. + char *month;
  3311. + register size_t i;
  3312. + register int lo = 0, hi = MONTHS_PER_YEAR, result;
  3313. + char *tmp;
  3314. + size_t wclength, mblength;
  3315. + const char *pp;
  3316. + const wchar_t *wpp;
  3317. + wchar_t *month_wcs;
  3318. + mbstate_t state;
  3319. +
  3320. + while (len > 0 && ismbblank (s, len, &mblength)) /* Skip leading blanks.  */
  3321. + {
  3322. + s += mblength;
  3323. + len -= mblength;
  3324. + }
  3325. +
  3326. + if (len == 0) /* Nothing but blanks: not a month.  */
  3327. + return 0;
  3328. +
  3329. + if (SIZE_MAX - len < 1) /* Guard the LEN + 1 computations below.  */
  3330. + xalloc_die ();
  3331. +
  3332. + month = (char *) xnmalloc (len + 1, MB_CUR_MAX); /* Receives the upper-cased multibyte text.  */
  3333. +
  3334. + pp = tmp = (char *) xnmalloc (len + 1, MB_CUR_MAX);
  3335. + memcpy (tmp, s, len); /* NUL-terminated copy of S for mbsrtowcs.  */
  3336. + tmp[len] = '\0';
  3337. + wpp = month_wcs = (wchar_t *) xnmalloc (len + 1, sizeof (wchar_t));
  3338. + memset (&state, '\0', sizeof (mbstate_t));
  3339. +
  3340. + wclength = mbsrtowcs (month_wcs, &pp, len + 1, &state);
  3341. + if (wclength == (size_t)-1 || pp != NULL) /* PP is non-NULL if conversion stopped before the NUL.  */
  3342. + error (SORT_FAILURE, 0, _("Invalid multibyte input %s."), quote(s));
  3343. +
  3344. + for (i = 0; i < wclength; i++) /* Upper-case, and cut the text at the first blank.  */
  3345. + {
  3346. + month_wcs[i] = towupper(month_wcs[i]);
  3347. + if (iswblank (month_wcs[i]))
  3348. + {
  3349. + month_wcs[i] = L'\0';
  3350. + break;
  3351. + }
  3352. + }
  3353. +
  3354. + mblength = wcsrtombs (month, &wpp, (len + 1) * MB_CUR_MAX, &state);
  3355. + assert (mblength != (-1) && wpp == NULL); /* -1 converts to (size_t) -1 in this comparison.  */
  3356. +
  3357. + do /* Binary search over monthtab, comparing only a table-name-length prefix.  */
  3358. + {
  3359. + int ix = (lo + hi) / 2;
  3360. +
  3361. + if (strncmp (month, monthtab[ix].name, strlen (monthtab[ix].name)) < 0)
  3362. + hi = ix;
  3363. + else
  3364. + lo = ix;
  3365. + }
  3366. + while (hi - lo > 1);
  3367. +
  3368. + result = (!strncmp (month, monthtab[lo].name, strlen (monthtab[lo].name))
  3369. + ? monthtab[lo].val : 0);
  3370. +
  3371. + if (ea && result) /* On a match, report the end as S plus the table name's byte length.  */
  3372. + *ea = (char*) s + strlen (monthtab[lo].name);
  3373. +
  3374. + free (month);
  3375. + free (tmp);
  3376. + free (month_wcs);
  3377. +
  3378. + return result;
  3379. +}
  3380. +#endif
  3381. +
  3382. /* Compare two lines A and B trying every key in sequence until there
  3383. are no more keys or a difference is found. */
  3384. static int
  3385. -keycompare (struct line const *a, struct line const *b)
  3386. +keycompare_uni (const struct line *a, const struct line *b)
  3387. {
  3388. struct keyfield *key = keylist;
  3389. @@ -2599,7 +3037,7 @@ keycompare (struct line const *a, struct
  3390. else if (key->human_numeric)
  3391. diff = human_numcompare (ta, tb);
  3392. else if (key->month)
  3393. - diff = getmonth (ta, NULL) - getmonth (tb, NULL);
  3394. + diff = getmonth (ta, tlena, NULL) - getmonth (tb, tlenb, NULL);
  3395. else if (key->random)
  3396. diff = compare_random (ta, tlena, tb, tlenb);
  3397. else if (key->version)
  3398. @@ -2715,6 +3153,211 @@ keycompare (struct line const *a, struct
  3399. return key->reverse ? -diff : diff;
  3400. }
  3401. +#if HAVE_MBRTOWC
  3402. +static int
  3403. +keycompare_mb (const struct line *a, const struct line *b)
  3404. +{ /* Multibyte counterpart of keycompare_uni: compare A and B key by key.  */
  3405. + struct keyfield *key = keylist;
  3406. +
  3407. + /* For the first iteration only, the key positions have been
  3408. + precomputed for us. */
  3409. + char *texta = a->keybeg;
  3410. + char *textb = b->keybeg;
  3411. + char *lima = a->keylim;
  3412. + char *limb = b->keylim;
  3413. +
  3414. + size_t mblength_a, mblength_b;
  3415. + wchar_t wc_a, wc_b;
  3416. + mbstate_t state_a, state_b;
  3417. +
  3418. + int diff = 0;
  3419. +
  3420. + memset (&state_a, '\0', sizeof(mbstate_t));
  3421. + memset (&state_b, '\0', sizeof(mbstate_t));
  3422. + /* Ignore keys with start after end. */
  3423. + if (a->keybeg - a->keylim > 0) /* NOTE(review): only A's bounds are checked here.  */
  3424. + return 0;
  3425. +
  3426. +
  3427. + /* Ignore and/or translate chars before comparing. */
  3428. +# define IGNORE_CHARS(NEW_LEN, LEN, TEXT, COPY, WC, MBLENGTH, STATE) \
  3429. + do \
  3430. + { \
  3431. + wchar_t uwc; \
  3432. + char mbc[MB_LEN_MAX]; \
  3433. + mbstate_t state_wc; \
  3434. + \
  3435. + for (NEW_LEN = i = 0; i < LEN;) \
  3436. + { \
  3437. + mbstate_t state_bak; \
  3438. + \
  3439. + state_bak = STATE; \
  3440. + MBLENGTH = mbrtowc (&WC, TEXT + i, LEN - i, &STATE); \
  3441. + \
  3442. + if (MBLENGTH == (size_t)-2 || MBLENGTH == (size_t)-1 \
  3443. + || MBLENGTH == 0) \
  3444. + { \
  3445. + if (MBLENGTH == (size_t)-2 || MBLENGTH == (size_t)-1) \
  3446. + STATE = state_bak; \
  3447. + if (!ignore) \
  3448. + COPY[NEW_LEN++] = TEXT[i]; \
  3449. + i++; \
  3450. + continue; \
  3451. + } \
  3452. + \
  3453. + if (ignore) \
  3454. + { \
  3455. + if ((ignore == nonprinting && !iswprint (WC)) \
  3456. + || (ignore == nondictionary \
  3457. + && !iswalnum (WC) && !iswblank (WC))) \
  3458. + { \
  3459. + i += MBLENGTH; \
  3460. + continue; \
  3461. + } \
  3462. + } \
  3463. + \
  3464. + if (translate) \
  3465. + { \
  3466. + \
  3467. + uwc = towupper(WC); \
  3468. + if (WC == uwc) \
  3469. + { \
  3470. + memcpy (mbc, TEXT + i, MBLENGTH); \
  3471. + i += MBLENGTH; \
  3472. + } \
  3473. + else \
  3474. + { \
  3475. + i += MBLENGTH; \
  3476. + WC = uwc; \
  3477. + memset (&state_wc, '\0', sizeof (mbstate_t)); \
  3478. + \
  3479. + MBLENGTH = wcrtomb (mbc, WC, &state_wc); \
  3480. + assert (MBLENGTH != (size_t)-1 && MBLENGTH != 0); \
  3481. + } \
  3482. + \
  3483. + for (j = 0; j < MBLENGTH; j++) \
  3484. + COPY[NEW_LEN++] = mbc[j]; \
  3485. + } \
  3486. + else \
  3487. + for (j = 0; j < MBLENGTH; j++) \
  3488. + COPY[NEW_LEN++] = TEXT[i++]; \
  3489. + } \
  3490. + COPY[NEW_LEN] = '\0'; \
  3491. + } \
  3492. + while (0)
  3493. +
  3494. + /* Actually compare the fields. */
  3495. +
  3496. + for (;;)
  3497. + {
  3498. + /* Find the lengths. */
  3499. + size_t lena = lima <= texta ? 0 : lima - texta;
  3500. + size_t lenb = limb <= textb ? 0 : limb - textb;
  3501. +
  3502. + char enda IF_LINT (= 0);
  3503. + char endb IF_LINT (= 0);
  3504. +
  3505. + char const *translate = key->translate;
  3506. + bool const *ignore = key->ignore;
  3507. +
  3508. + if (ignore || translate)
  3509. + {
  3510. + if (SIZE_MAX - lenb - 2 < lena)
  3511. + xalloc_die ();
  3512. + char *copy_a = (char *) xnmalloc (lena + lenb + 2, MB_CUR_MAX); /* One allocation holds both copies.  */
  3513. + char *copy_b = copy_a + lena * MB_CUR_MAX + 1; /* Starts after LENA * MB_CUR_MAX bytes plus a NUL.  */
  3514. + size_t new_len_a, new_len_b;
  3515. + size_t i, j; /* Scratch indices used inside IGNORE_CHARS.  */
  3516. +
  3517. + IGNORE_CHARS (new_len_a, lena, texta, copy_a,
  3518. + wc_a, mblength_a, state_a);
  3519. + IGNORE_CHARS (new_len_b, lenb, textb, copy_b,
  3520. + wc_b, mblength_b, state_b);
  3521. + texta = copy_a; textb = copy_b;
  3522. + lena = new_len_a; lenb = new_len_b;
  3523. + }
  3524. + else
  3525. + {
  3526. + /* Use the keys in-place, temporarily null-terminated. */
  3527. + enda = texta[lena]; texta[lena] = '\0';
  3528. + endb = textb[lenb]; textb[lenb] = '\0';
  3529. + }
  3530. +
  3531. + if (key->random)
  3532. + diff = compare_random (texta, lena, textb, lenb);
  3533. + else if (key->numeric | key->general_numeric | key->human_numeric)
  3534. + {
  3535. + char savea = *lima, saveb = *limb;
  3536. +
  3537. + *lima = *limb = '\0';
  3538. + diff = (key->numeric ? numcompare (texta, textb)
  3539. + : key->general_numeric ? general_numcompare (texta, textb)
  3540. + : human_numcompare (texta, textb));
  3541. + *lima = savea, *limb = saveb;
  3542. + }
  3543. + else if (key->version)
  3544. + diff = filevercmp (texta, textb);
  3545. + else if (key->month)
  3546. + diff = getmonth (texta, lena, NULL) - getmonth (textb, lenb, NULL);
  3547. + else if (lena == 0)
  3548. + diff = - NONZERO (lenb);
  3549. + else if (lenb == 0)
  3550. + diff = 1;
  3551. + else if (hard_LC_COLLATE && !folding)
  3552. + {
  3553. + diff = xmemcoll0 (texta, lena + 1, textb, lenb + 1); /* The lengths passed count the terminating NUL.  */
  3554. + }
  3555. + else
  3556. + {
  3557. + diff = memcmp (texta, textb, MIN (lena, lenb));
  3558. + if (diff == 0)
  3559. + diff = lena < lenb ? -1 : lena != lenb;
  3560. + }
  3561. +
  3562. + if (ignore || translate)
  3563. + free (texta); /* TEXTA is COPY_A here; COPY_B lies in the same allocation.  */
  3564. + else
  3565. + {
  3566. + texta[lena] = enda;
  3567. + textb[lenb] = endb;
  3568. + }
  3569. +
  3570. + if (diff)
  3571. + goto not_equal;
  3572. +
  3573. + key = key->next;
  3574. + if (! key)
  3575. + break;
  3576. +
  3577. + /* Find the beginning and limit of the next field. */
  3578. + if (key->eword != -1)
  3579. + lima = limfield (a, key), limb = limfield (b, key);
  3580. + else
  3581. + lima = a->text + a->length - 1, limb = b->text + b->length - 1;
  3582. +
  3583. + if (key->sword != -1)
  3584. + texta = begfield (a, key), textb = begfield (b, key);
  3585. + else
  3586. + {
  3587. + texta = a->text, textb = b->text;
  3588. + if (key->skipsblanks)
  3589. + {
  3590. + while (texta < lima && ismbblank (texta, lima - texta, &mblength_a))
  3591. + texta += mblength_a;
  3592. + while (textb < limb && ismbblank (textb, limb - textb, &mblength_b))
  3593. + textb += mblength_b;
  3594. + }
  3595. + }
  3596. + }
  3597. +
  3598. +not_equal:
  3599. + if (key && key->reverse) /* KEY is NULL when every key compared equal.  */
  3600. + return -diff;
  3601. + else
  3602. + return diff;
  3603. +}
  3604. +#endif
  3605. +
  3606. /* Compare two lines A and B, returning negative, zero, or positive
  3607. depending on whether A compares less than, equal to, or greater than B. */
  3608. @@ -2742,7 +3385,7 @@ compare (struct line const *a, struct li
  3609. diff = - NONZERO (blen);
  3610. else if (blen == 0)
  3611. diff = 1;
  3612. - else if (hard_LC_COLLATE)
  3613. + else if (hard_LC_COLLATE && !folding)
  3614. {
  3615. /* Note xmemcoll0 is a performance enhancement as
  3616. it will not unconditionally write '\0' after the
  3617. @@ -4139,6 +4782,7 @@ set_ordering (char const *s, struct keyf
  3618. break;
  3619. case 'f':
  3620. key->translate = fold_toupper;
  3621. + folding = true;
  3622. break;
  3623. case 'g':
  3624. key->general_numeric = true;
  3625. @@ -4218,7 +4862,7 @@ main (int argc, char **argv)
  3626. initialize_exit_failure (SORT_FAILURE);
  3627. hard_LC_COLLATE = hard_locale (LC_COLLATE);
  3628. -#if HAVE_NL_LANGINFO
  3629. +#if HAVE_LANGINFO_CODESET
  3630. hard_LC_TIME = hard_locale (LC_TIME);
  3631. #endif
  3632. @@ -4239,6 +4883,29 @@ main (int argc, char **argv)
  3633. thousands_sep = -1;
  3634. }
  3635. +#if HAVE_MBRTOWC
  3636. + if (MB_CUR_MAX > 1)
  3637. + {
  3638. + inittables = inittables_mb;
  3639. + begfield = begfield_mb;
  3640. + limfield = limfield_mb;
  3641. + skipblanks = skipblanks_mb;
  3642. + getmonth = getmonth_mb;
  3643. + keycompare = keycompare_mb;
  3644. + numcompare = numcompare_mb;
  3645. + }
  3646. + else
  3647. +#endif
  3648. + {
  3649. + inittables = inittables_uni;
  3650. + begfield = begfield_uni;
  3651. + limfield = limfield_uni;
  3652. + skipblanks = skipblanks_uni;
  3653. + getmonth = getmonth_uni;
  3654. + keycompare = keycompare_uni;
  3655. + numcompare = numcompare_uni;
  3656. + }
  3657. +
  3658. have_read_stdin = false;
  3659. inittables ();
  3660. @@ -4513,13 +5180,34 @@ main (int argc, char **argv)
  3661. case 't':
  3662. {
  3663. - char newtab = optarg[0];
  3664. - if (! newtab)
  3665. + char newtab[MB_LEN_MAX + 1];
  3666. + size_t newtab_length = 1;
  3667. + strncpy (newtab, optarg, MB_LEN_MAX);
  3668. + if (! newtab[0])
  3669. die (SORT_FAILURE, 0, _("empty tab"));
  3670. - if (optarg[1])
  3671. +#if HAVE_MBRTOWC
  3672. + if (MB_CUR_MAX > 1)
  3673. + {
  3674. + wchar_t wc;
  3675. + mbstate_t state;
  3676. +
  3677. + memset (&state, '\0', sizeof (mbstate_t));
  3678. + newtab_length = mbrtowc (&wc, newtab, strnlen (newtab,
  3679. + MB_LEN_MAX),
  3680. + &state);
  3681. + switch (newtab_length)
  3682. + {
  3683. + case (size_t) -1:
  3684. + case (size_t) -2:
  3685. + case 0:
  3686. + newtab_length = 1;
  3687. + }
  3688. + }
  3689. +#endif
  3690. + if (newtab_length == 1 && optarg[1])
  3691. {
  3692. if (STREQ (optarg, "\\0"))
  3693. - newtab = '\0';
  3694. + newtab[0] = '\0';
  3695. else
  3696. {
  3697. /* Provoke with 'sort -txx'. Complain about
  3698. @@ -4530,9 +5218,11 @@ main (int argc, char **argv)
  3699. quote (optarg));
  3700. }
  3701. }
  3702. - if (tab != TAB_DEFAULT && tab != newtab)
  3703. + if (tab_length && (tab_length != newtab_length
  3704. + || memcmp (tab, newtab, tab_length) != 0))
  3705. die (SORT_FAILURE, 0, _("incompatible tabs"));
  3706. - tab = newtab;
  3707. + memcpy (tab, newtab, newtab_length);
  3708. + tab_length = newtab_length;
  3709. }
  3710. break;
  3711. @@ -4770,12 +5460,10 @@ main (int argc, char **argv)
  3712. sort (files, nfiles, outfile, nthreads);
  3713. }
  3714. -#ifdef lint
  3715. if (files_from)
  3716. readtokens0_free (&tok);
  3717. else
  3718. free (files);
  3719. -#endif
  3720. if (have_read_stdin && fclose (stdin) == EOF)
  3721. sort_die (_("close failed"), "-");
  3722. diff -Naurp coreutils-8.27-orig/src/unexpand.c coreutils-8.27/src/unexpand.c
  3723. --- coreutils-8.27-orig/src/unexpand.c 2017-01-01 16:34:24.000000000 -0600
  3724. +++ coreutils-8.27/src/unexpand.c 2017-03-11 23:49:06.758133530 -0600
  3725. @@ -38,6 +38,9 @@
  3726. #include <stdio.h>
  3727. #include <getopt.h>
  3728. #include <sys/types.h>
  3729. +
  3730. +#include <mbfile.h>
  3731. +
  3732. #include "system.h"
  3733. #include "die.h"
  3734. #include "xstrndup.h"
  3735. @@ -107,24 +110,47 @@ unexpand (void)
  3736. {
  3737. /* Input stream. */
  3738. FILE *fp = next_file (NULL);
  3739. + mb_file_t mbf;
  3740. /* The array of pending blanks. In non-POSIX locales, blanks can
  3741. include characters other than spaces, so the blanks must be
  3742. stored, not merely counted. */
  3743. - char *pending_blank;
  3744. + mbf_char_t *pending_blank;
  3745. + /* True if the starting locale is utf8. */
  3746. + bool using_utf_locale;
  3747. +
  3748. + /* True if the first file contains BOM header. */
  3749. + bool found_bom;
  3750. + using_utf_locale=check_utf_locale();
  3751. if (!fp)
  3752. return;
  3753. + mbf_init (mbf, fp);
  3754. + found_bom=check_bom(fp,&mbf);
  3755. + if (using_utf_locale == false && found_bom == true)
  3756. + {
  3757. + /*try using some predefined locale */
  3758. +
  3759. + if (set_utf_locale () != 0)
  3760. + {
  3761. + error (EXIT_FAILURE, errno, _("cannot set UTF-8 locale"));
  3762. + }
  3763. + }
  3764. /* The worst case is a non-blank character, then one blank, then a
  3765. tab stop, then MAX_COLUMN_WIDTH - 1 blanks, then a non-blank; so
  3766. allocate MAX_COLUMN_WIDTH bytes to store the blanks. */
  3767. - pending_blank = xmalloc (max_column_width);
  3768. + pending_blank = xmalloc (max_column_width * sizeof (mbf_char_t));
  3769. +
  3770. + if (found_bom == true)
  3771. + {
  3772. + print_bom();
  3773. + }
  3774. while (true)
  3775. {
  3776. /* Input character, or EOF. */
  3777. - int c;
  3778. + mbf_char_t c;
  3779. /* If true, perform translations. */
  3780. bool convert = true;
  3781. @@ -158,12 +184,44 @@ unexpand (void)
  3782. do
  3783. {
  3784. - while ((c = getc (fp)) < 0 && (fp = next_file (fp)))
  3785. - continue;
  3786. + while (true) {
  3787. + mbf_getc (c, mbf);
  3788. + if ((mb_iseof (c)) && (fp = next_file (fp)))
  3789. + {
  3790. + mbf_init (mbf, fp);
  3791. + if (fp!=NULL)
  3792. + {
  3793. + if (check_bom(fp,&mbf)==true)
  3794. + {
  3795. + /*Not the first file - check BOM header*/
  3796. + if (using_utf_locale==false && found_bom==false)
  3797. + {
  3798. + /*BOM header in subsequent file but not in the first one. */
  3799. + error (EXIT_FAILURE, errno, _("combination of files with and without BOM header"));
  3800. + }
  3801. + }
  3802. + else
  3803. + {
  3804. + if(using_utf_locale==false && found_bom==true)
  3805. + {
  3806. + /* First file contained BOM header - locale was switched to UTF;
  3807. + all subsequent files should contain BOM. */
  3808. + error (EXIT_FAILURE, errno, _("combination of files with and without BOM header"));
  3809. + }
  3810. + }
  3811. + }
  3812. + continue;
  3813. + }
  3814. + else
  3815. + {
  3816. + break;
  3817. + }
  3818. + }
  3819. +
  3820. if (convert)
  3821. {
  3822. - bool blank = !! isblank (c);
  3823. + bool blank = mb_isblank (c);
  3824. if (blank)
  3825. {
  3826. @@ -180,16 +238,16 @@ unexpand (void)
  3827. if (next_tab_column < column)
  3828. die (EXIT_FAILURE, 0, _("input line is too long"));
  3829. - if (c == '\t')
  3830. + if (mb_iseq (c, '\t'))
  3831. {
  3832. column = next_tab_column;
  3833. if (pending)
  3834. - pending_blank[0] = '\t';
  3835. + mb_setascii (&pending_blank[0], '\t');
  3836. }
  3837. else
  3838. {
  3839. - column++;
  3840. + column += mb_width (c);
  3841. if (! (prev_blank && column == next_tab_column))
  3842. {
  3843. @@ -197,13 +255,14 @@ unexpand (void)
  3844. will be replaced by tabs. */
  3845. if (column == next_tab_column)
  3846. one_blank_before_tab_stop = true;
  3847. - pending_blank[pending++] = c;
  3848. + mb_copy (&pending_blank[pending++], &c);
  3849. prev_blank = true;
  3850. continue;
  3851. }
  3852. /* Replace the pending blanks by a tab or two. */
  3853. - pending_blank[0] = c = '\t';
  3854. + mb_setascii (&c, '\t');
  3855. + mb_setascii (&pending_blank[0], '\t');
  3856. }
  3857. /* Discard pending blanks, unless it was a single
  3858. @@ -211,7 +270,7 @@ unexpand (void)
  3859. pending = one_blank_before_tab_stop;
  3860. }
  3861. }
  3862. - else if (c == '\b')
  3863. + else if (mb_iseq (c, '\b'))
  3864. {
  3865. /* Go back one column, and force recalculation of the
  3866. next tab stop. */
  3867. @@ -219,9 +278,9 @@ unexpand (void)
  3868. next_tab_column = column;
  3869. tab_index -= !!tab_index;
  3870. }
  3871. - else
  3872. + else if (!mb_iseq (c, '\n'))
  3873. {
  3874. - column++;
  3875. + column += mb_width (c);
  3876. if (!column)
  3877. die (EXIT_FAILURE, 0, _("input line is too long"));
  3878. }
  3879. @@ -229,8 +288,11 @@ unexpand (void)
  3880. if (pending)
  3881. {
  3882. if (pending > 1 && one_blank_before_tab_stop)
  3883. - pending_blank[0] = '\t';
  3884. - if (fwrite (pending_blank, 1, pending, stdout) != pending)
  3885. + mb_setascii (&pending_blank[0], '\t');
  3886. +
  3887. + for (int n = 0; n < pending; ++n)
  3888. + mb_putc (pending_blank[n], stdout);
  3889. + if (ferror (stdout))
  3890. die (EXIT_FAILURE, errno, _("write error"));
  3891. pending = 0;
  3892. one_blank_before_tab_stop = false;
  3893. @@ -240,16 +302,17 @@ unexpand (void)
  3894. convert &= convert_entire_line || blank;
  3895. }
  3896. - if (c < 0)
  3897. + if (mb_iseof (c))
  3898. {
  3899. free (pending_blank);
  3900. return;
  3901. }
  3902. - if (putchar (c) < 0)
  3903. + mb_putc (c, stdout);
  3904. + if (ferror (stdout))
  3905. die (EXIT_FAILURE, errno, _("write error"));
  3906. }
  3907. - while (c != '\n');
  3908. + while (!mb_iseq (c, '\n'));
  3909. }
  3910. }
  3911. diff -Naurp coreutils-8.27-orig/src/uniq.c coreutils-8.27/src/uniq.c
  3912. --- coreutils-8.27-orig/src/uniq.c 2017-01-01 16:34:24.000000000 -0600
  3913. +++ coreutils-8.27/src/uniq.c 2017-03-11 23:47:13.098285938 -0600
  3914. @@ -21,6 +21,17 @@
  3915. #include <getopt.h>
  3916. #include <sys/types.h>
  3917. +/* Get mbstate_t, mbrtowc(). */
  3918. +#if HAVE_WCHAR_H
  3919. +# include <wchar.h>
  3920. +#endif
  3921. +
  3922. +/* Get isw* functions. */
  3923. +#if HAVE_WCTYPE_H
  3924. +# include <wctype.h>
  3925. +#endif
  3926. +#include <assert.h>
  3927. +
  3928. #include "system.h"
  3929. #include "argmatch.h"
  3930. #include "linebuffer.h"
  3931. @@ -32,9 +43,21 @@
  3932. #include "stdio--.h"
  3933. #include "xmemcoll.h"
  3934. #include "xstrtol.h"
  3935. -#include "memcasecmp.h"
  3936. +#include "xmemcoll.h"
  3937. #include "quote.h"
  3938. +/* MB_LEN_MAX is incorrectly defined to be 1 in at least one GCC
  3939. + installation; work around this configuration error. */
  3940. +#if !defined MB_LEN_MAX || MB_LEN_MAX < 2
  3941. +# define MB_LEN_MAX 16
  3942. +#endif
  3943. +
  3944. +/* Some systems, like BeOS, have multibyte encodings but lack mbstate_t. */
  3945. +#if HAVE_MBRTOWC && defined mbstate_t
  3946. +# define mbrtowc(pwc, s, n, ps) (mbrtowc) (pwc, s, n, 0)
  3947. +#endif
  3948. +
  3949. +
  3950. /* The official name of this program (e.g., no 'g' prefix). */
  3951. #define PROGRAM_NAME "uniq"
  3952. @@ -144,6 +167,10 @@ enum
  3953. GROUP_OPTION = CHAR_MAX + 1
  3954. };
  3955. +/* Function pointers. */
  3956. +static char *
  3957. +(*find_field) (struct linebuffer *line);
  3958. +
  3959. static struct option const longopts[] =
  3960. {
  3961. {"count", no_argument, NULL, 'c'},
  3962. @@ -260,7 +287,7 @@ size_opt (char const *opt, char const *m
  3963. return a pointer to the beginning of the line's field to be compared. */
  3964. static char * _GL_ATTRIBUTE_PURE
  3965. -find_field (struct linebuffer const *line)
  3966. +find_field_uni (struct linebuffer *line)
  3967. {
  3968. size_t count;
  3969. char const *lp = line->buffer;
  3970. @@ -280,6 +307,83 @@ find_field (struct linebuffer const *lin
  3971. return line->buffer + i;
  3972. }
  3973. +#if HAVE_MBRTOWC
  3974. +/* Convert the multibyte character at LP + POS to WC; set MBLENGTH to the number of bytes to advance (1 on failure or NUL) and CONVFAIL nonzero on failure. */
  3975. +# define MBCHAR_TO_WCHAR(WC, MBLENGTH, LP, POS, SIZE, STATEP, CONVFAIL) \
  3976. + do \
  3977. + { \
  3978. + mbstate_t state_bak; \
  3979. + /* Clear the failure flag and save the shift state for rollback. */ \
  3980. + CONVFAIL = 0; \
  3981. + state_bak = *STATEP; \
  3982. + /* Decode one character from the SIZE - POS bytes at LP + POS. */ \
  3983. + MBLENGTH = mbrtowc (&WC, LP + POS, SIZE - POS, STATEP); \
  3984. + \
  3985. + switch (MBLENGTH) \
  3986. + { \
  3987. + case (size_t)-2: \
  3988. + case (size_t)-1: \
  3989. + *STATEP = state_bak; \
  3990. + CONVFAIL++; \
  3991. + /* Fall through */ \
  3992. + case 0: \
  3993. + MBLENGTH = 1; \
  3994. + } \
  3995. + } \
  3996. + while (0)
  3997. +/* Multibyte counterpart of find_field_uni: skip skip_fields fields and then skip_chars characters of LINE, decoding with mbrtowc, and return a pointer to the first byte left. */
  3998. +static char *
  3999. +find_field_multi (struct linebuffer *line)
  4000. +{
  4001. + size_t count;
  4002. + char *lp = line->buffer;
  4003. + size_t size = line->length - 1;
  4004. + size_t pos;
  4005. + size_t mblength;
  4006. + wchar_t wc;
  4007. + mbstate_t *statep;
  4008. + int convfail = 0;
  4009. +
  4010. + pos = 0;
  4011. + statep = &(line->state);
  4012. +
  4013. + /* Skip fields: each is a run of blanks followed by a run of non-blanks. */
  4014. + for (count = 0; count < skip_fields && pos < size; count++)
  4015. + {
  4016. + while (pos < size)
  4017. + {
  4018. + MBCHAR_TO_WCHAR (wc, mblength, lp, pos, size, statep, convfail);
  4019. + /* A non-blank or undecodable byte ends the blank run; consume it too. */
  4020. + if (convfail || !(iswblank (wc) || wc == '\n'))
  4021. + {
  4022. + pos += mblength;
  4023. + break;
  4024. + }
  4025. + pos += mblength;
  4026. + }
  4027. +
  4028. + while (pos < size)
  4029. + {
  4030. + MBCHAR_TO_WCHAR (wc, mblength, lp, pos, size, statep, convfail);
  4031. + /* Stop at the next blank without consuming it. */
  4032. + if (!convfail && (iswblank (wc) || wc == '\n'))
  4033. + break;
  4034. +
  4035. + pos += mblength;
  4036. + }
  4037. + }
  4038. +
  4039. + /* Skip characters (counted per decoded character, not per byte). */
  4040. + for (count = 0; count < skip_chars && pos < size; count++)
  4041. + {
  4042. + MBCHAR_TO_WCHAR (wc, mblength, lp, pos, size, statep, convfail);
  4043. + pos += mblength;
  4044. + }
  4045. +
  4046. + return lp + pos;
  4047. +}
  4048. +#endif
  4049. +
  4050. /* Return false if two strings OLD and NEW match, true if not.
  4051. OLD and NEW point not to the beginnings of the lines
  4052. but rather to the beginnings of the fields to compare.
  4053. @@ -288,6 +392,8 @@ find_field (struct linebuffer const *lin
  4054. static bool
  4055. different (char *old, char *new, size_t oldlen, size_t newlen)
  4056. {
  4057. + char *copy_old, *copy_new;
  4058. + /* These are either upper-cased heap copies (ignore_case) or aliases of OLD and NEW. */
  4059. if (check_chars < oldlen)
  4060. oldlen = check_chars;
  4061. if (check_chars < newlen)
  4062. @@ -295,14 +401,103 @@ different (char *old, char *new, size_t
  4063. if (ignore_case)
  4064. {
  4065. - /* FIXME: This should invoke strcoll somehow. */
  4066. - return oldlen != newlen || memcasecmp (old, new, oldlen);
  4067. + size_t i;
  4068. + /* OLDLEN and NEWLEN may differ, so each copy is sized by its own length. */
  4069. + copy_old = xmalloc (oldlen + 1);
  4070. + copy_new = xmalloc (newlen + 1);
  4071. +
  4072. + /* Fold each field over its own length; toupper needs unsigned char values. */
  4073. + for (i = 0; i < oldlen; i++)
  4074. + copy_old[i] = toupper ((unsigned char) old[i]);
  4075. + for (i = 0; i < newlen; i++)
  4076. + copy_new[i] = toupper ((unsigned char) new[i]);
  4077. + bool rc = xmemcoll (copy_old, oldlen, copy_new, newlen);
  4078. + free (copy_old);
  4079. + free (copy_new);
  4080. + return rc;
  4081. }
  4082. - else if (hard_LC_COLLATE)
  4083. - return xmemcoll (old, oldlen, new, newlen) != 0;
  4084. else
  4085. - return oldlen != newlen || memcmp (old, new, oldlen);
  4086. + {
  4087. + copy_old = (char *)old;
  4088. + copy_new = (char *)new;
  4089. + }
  4090. +
  4091. + return xmemcoll (copy_old, oldlen, copy_new, newlen);
  4092. +
  4093. +}
  4094. +/* Multibyte version of different: compare at most check_chars characters of OLD and NEW, upper-cased with towupper when ignore_case is set; return xmemcoll's result. */
  4095. +#if HAVE_MBRTOWC
  4096. +static int
  4097. +different_multi (const char *old, const char *new, size_t oldlen, size_t newlen, mbstate_t oldstate, mbstate_t newstate)
  4098. +{
  4099. + size_t i, j, chars;
  4100. + const char *str[2];
  4101. + char *copy[2];
  4102. + size_t len[2];
  4103. + mbstate_t state[2];
  4104. + size_t mblength;
  4105. + wchar_t wc, uwc;
  4106. + mbstate_t state_bak;
  4107. + /* Slot 0 describes the OLD field, slot 1 the NEW field. */
  4108. + str[0] = old;
  4109. + str[1] = new;
  4110. + len[0] = oldlen;
  4111. + len[1] = newlen;
  4112. + state[0] = oldstate;
  4113. + state[1] = newstate;
  4114. + /* Build a zero-filled copy of each field, one decoded character at a time. */
  4115. + for (i = 0; i < 2; i++)
  4116. + {
  4117. + copy[i] = xmalloc (len[i] + 1);
  4118. + memset (copy[i], '\0', len[i] + 1);
  4119. +
  4120. + for (j = 0, chars = 0; j < len[i] && chars < check_chars; chars++)
  4121. + {
  4122. + state_bak = state[i];
  4123. + mblength = mbrtowc (&wc, str[i] + j, len[i] - j, &(state[i]));
  4124. +
  4125. + switch (mblength)
  4126. + {
  4127. + case (size_t)-1:
  4128. + case (size_t)-2:
  4129. + state[i] = state_bak;
  4130. + /* Fall through */
  4131. + case 0:
  4132. + mblength = 1;
  4133. + break;
  4134. + /* NOTE(review): in the cases above the source byte is not copied, so copy[i][j] keeps the '\0' from memset -- confirm this is intended. */
  4135. + default:
  4136. + if (ignore_case)
  4137. + {
  4138. + uwc = towupper (wc);
  4139. +
  4140. + if (uwc != wc)
  4141. + {
  4142. + mbstate_t state_wc;
  4143. + size_t mblen;
  4144. + /* NOTE(review): wcrtomb may emit more bytes than mblength, while j advances by mblength and copy[i] holds len[i] + 1 bytes -- confirm this cannot overrun. */
  4145. + memset (&state_wc, '\0', sizeof(mbstate_t));
  4146. + mblen = wcrtomb (copy[i] + j, uwc, &state_wc);
  4147. + assert (mblen != (size_t)-1);
  4148. + }
  4149. + else
  4150. + memcpy (copy[i] + j, str[i] + j, mblength);
  4151. + }
  4152. + else
  4153. + memcpy (copy[i] + j, str[i] + j, mblength);
  4154. + }
  4155. + j += mblength;
  4156. + }
  4157. + copy[i][j] = '\0';
  4158. + len[i] = j;
  4159. + }
  4160. + int rc = xmemcoll (copy[0], len[0], copy[1], len[1]);
  4161. + free (copy[0]);
  4162. + free (copy[1]);
  4163. + return rc;
  4164. +
  4165. }
  4166. +#endif
  4167. /* Output the line in linebuffer LINE to standard output
  4168. provided that the switches say it should be output.
  4169. @@ -367,19 +562,38 @@ check_file (const char *infile, const ch
  4170. char *prevfield IF_LINT ( = NULL);
  4171. size_t prevlen IF_LINT ( = 0);
  4172. bool first_group_printed = false;
  4173. +#if HAVE_MBRTOWC
  4174. + mbstate_t prevstate;
  4175. +
  4176. + memset (&prevstate, '\0', sizeof (mbstate_t));
  4177. +#endif
  4178. while (!feof (stdin))
  4179. {
  4180. char *thisfield;
  4181. size_t thislen;
  4182. bool new_group;
  4183. +#if HAVE_MBRTOWC
  4184. + mbstate_t thisstate;
  4185. +#endif
  4186. if (readlinebuffer_delim (thisline, stdin, delimiter) == 0)
  4187. break;
  4188. thisfield = find_field (thisline);
  4189. thislen = thisline->length - 1 - (thisfield - thisline->buffer);
  4190. +#if HAVE_MBRTOWC
  4191. + if (MB_CUR_MAX > 1)
  4192. + {
  4193. + thisstate = thisline->state;
  4194. + new_group = (prevline->length == 0
  4195. + || different_multi (thisfield, prevfield,
  4196. + thislen, prevlen,
  4197. + thisstate, prevstate));
  4198. + }
  4199. + else
  4200. +#endif
  4201. new_group = (prevline->length == 0
  4202. || different (thisfield, prevfield, thislen, prevlen));
  4203. @@ -397,6 +611,10 @@ check_file (const char *infile, const ch
  4204. SWAP_LINES (prevline, thisline);
  4205. prevfield = thisfield;
  4206. prevlen = thislen;
  4207. +#if HAVE_MBRTOWC
  4208. + if (MB_CUR_MAX > 1)
  4209. + prevstate = thisstate;
  4210. +#endif
  4211. first_group_printed = true;
  4212. }
  4213. }
  4214. @@ -409,17 +627,26 @@ check_file (const char *infile, const ch
  4215. size_t prevlen;
  4216. uintmax_t match_count = 0;
  4217. bool first_delimiter = true;
  4218. +#if HAVE_MBRTOWC
  4219. + mbstate_t prevstate;
  4220. +#endif
  4221. if (readlinebuffer_delim (prevline, stdin, delimiter) == 0)
  4222. goto closefiles;
  4223. prevfield = find_field (prevline);
  4224. prevlen = prevline->length - 1 - (prevfield - prevline->buffer);
  4225. +#if HAVE_MBRTOWC
  4226. + prevstate = prevline->state;
  4227. +#endif
  4228. while (!feof (stdin))
  4229. {
  4230. bool match;
  4231. char *thisfield;
  4232. size_t thislen;
  4233. +#if HAVE_MBRTOWC
  4234. + mbstate_t thisstate = thisline->state;
  4235. +#endif
  4236. if (readlinebuffer_delim (thisline, stdin, delimiter) == 0)
  4237. {
  4238. if (ferror (stdin))
  4239. @@ -428,6 +655,14 @@ check_file (const char *infile, const ch
  4240. }
  4241. thisfield = find_field (thisline);
  4242. thislen = thisline->length - 1 - (thisfield - thisline->buffer);
  4243. +#if HAVE_MBRTOWC
  4244. + if (MB_CUR_MAX > 1)
  4245. + {
  4246. + match = !different_multi (thisfield, prevfield,
  4247. + thislen, prevlen, thisstate, prevstate);
  4248. + }
  4249. + else
  4250. +#endif
  4251. match = !different (thisfield, prevfield, thislen, prevlen);
  4252. match_count += match;
  4253. @@ -460,6 +695,9 @@ check_file (const char *infile, const ch
  4254. SWAP_LINES (prevline, thisline);
  4255. prevfield = thisfield;
  4256. prevlen = thislen;
  4257. +#if HAVE_MBRTOWC
  4258. + prevstate = thisstate;
  4259. +#endif
  4260. if (!match)
  4261. match_count = 0;
  4262. }
  4263. @@ -506,6 +744,19 @@ main (int argc, char **argv)
  4264. atexit (close_stdout);
  4265. +#if HAVE_MBRTOWC
  4266. + if (MB_CUR_MAX > 1)
  4267. + {
  4268. + find_field = find_field_multi;
  4269. + }
  4270. + else
  4271. +#endif
  4272. + {
  4273. + find_field = find_field_uni;
  4274. + }
  4275. +
  4276. +
  4277. +
  4278. skip_chars = 0;
  4279. skip_fields = 0;
  4280. check_chars = SIZE_MAX;
  4281. diff -Naurp coreutils-8.27-orig/tests/expand/mb.sh coreutils-8.27/tests/expand/mb.sh
  4282. --- coreutils-8.27-orig/tests/expand/mb.sh 1969-12-31 18:00:00.000000000 -0600
  4283. +++ coreutils-8.27/tests/expand/mb.sh 2017-03-11 23:49:06.759133489 -0600
  4284. @@ -0,0 +1,183 @@
  4285. +#!/bin/sh
  4286. +
  4287. +# Copyright (C) 2012-2017 Free Software Foundation, Inc.
  4288. +
  4289. +# This program is free software: you can redistribute it and/or modify
  4290. +# it under the terms of the GNU General Public License as published by
  4291. +# the Free Software Foundation, either version 3 of the License, or
  4292. +# (at your option) any later version.
  4293. +
  4294. +# This program is distributed in the hope that it will be useful,
  4295. +# but WITHOUT ANY WARRANTY; without even the implied warranty of
  4296. +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  4297. +# GNU General Public License for more details.
  4298. +
  4299. +# You should have received a copy of the GNU General Public License
  4300. +# along with this program. If not, see <http://www.gnu.org/licenses/>.
  4301. +
  4302. +. "${srcdir=.}/tests/init.sh"; path_prepend_ ./src
  4303. +print_ver_ expand
  4304. +
  4305. +export LC_ALL=en_US.UTF-8
  4306. +
  4307. +#input containing multibyte characters
  4308. +cat <<\EOF > in || framework_failure_
  4309. +1234567812345678123456781
  4310. +. . . .
  4311. +a b c d
  4312. +. . . .
  4313. +ä ö ü ß
  4314. +. . . .
  4315. +EOF
  4316. +env printf ' äöü\t. öüä. \tä xx\n' >> in || framework_failure_
  4317. +
  4318. +cat <<\EOF > exp || framework_failure_
  4319. +1234567812345678123456781
  4320. +. . . .
  4321. +a b c d
  4322. +. . . .
  4323. +ä ö ü ß
  4324. +. . . .
  4325. + äöü . öüä. ä xx
  4326. +EOF
  4327. +
  4328. +expand < in > out || fail=1
  4329. +compare exp out > /dev/null 2>&1 || fail=1
  4330. +
  4331. +#multiple files as an input
  4332. +cat <<\EOF >> exp || framework_failure_
  4333. +1234567812345678123456781
  4334. +. . . .
  4335. +a b c d
  4336. +. . . .
  4337. +ä ö ü ß
  4338. +. . . .
  4339. + äöü . öüä. ä xx
  4340. +EOF
  4341. +
  4342. +expand ./in ./in > out || fail=1
  4343. +compare exp out > /dev/null 2>&1 || fail=1
  4344. +
  4345. +#test characters with display widths != 1
  4346. +env printf '12345678
  4347. +e\t|ascii(1)
  4348. +\u00E9\t|composed(1)
  4349. +e\u0301\t|decomposed(1)
  4350. +\u3000\t|ideo-space(2)
  4351. +\uFF0D\t|full-hypen(2)
  4352. +' > in || framework_failure_
  4353. +
  4354. +env printf '12345678
  4355. +e |ascii(1)
  4356. +\u00E9 |composed(1)
  4357. +e\u0301 |decomposed(1)
  4358. +\u3000 |ideo-space(2)
  4359. +\uFF0D |full-hypen(2)
  4360. +' > exp || framework_failure_
  4361. +
  4362. +expand < in > out || fail=1
  4363. +compare exp out > /dev/null 2>&1 || fail=1
  4364. +
  4365. +#shouldn't fail with "input line too long"
  4366. +#when a line starts with a control character
  4367. +env printf '\n' > in || framework_failure_
  4368. +
  4369. +expand < in > out || fail=1
  4370. +compare in out > /dev/null 2>&1 || fail=1
  4371. +
  4372. +#non-Unicode characters interspersed between Unicode ones
  4373. +env printf '12345678
  4374. +\t\xFF|
  4375. +\xFF\t|
  4376. +\t\xFFä|
  4377. +ä\xFF\t|
  4378. +\tä\xFF|
  4379. +\xFF\tä|
  4380. +äbcdef\xFF\t|
  4381. +' > in || framework_failure_
  4382. +
  4383. +env printf '12345678
  4384. + \xFF|
  4385. +\xFF |
  4386. + \xFFä|
  4387. +ä\xFF |
  4388. + ä\xFF|
  4389. +\xFF ä|
  4390. +äbcdef\xFF |
  4391. +' > exp || framework_failure_
  4392. +
  4393. +expand < in > out || fail=1
  4394. +compare exp out > /dev/null 2>&1 || fail=1
  4395. +
  4396. +
  4397. +
  4398. +#BOM header test 1
  4399. +printf "\xEF\xBB\xBF" > in; cat <<\EOF >> in || framework_failure_
  4400. +1234567812345678123456781
  4401. +. . . .
  4402. +a b c d
  4403. +. . . .
  4404. +ä ö ü ß
  4405. +. . . .
  4406. +EOF
  4407. +env printf ' äöü\t. öüä. \tä xx\n' >> in || framework_failure_
  4408. +
  4409. +printf "\xEF\xBB\xBF" > exp; cat <<\EOF >> exp || framework_failure_
  4410. +1234567812345678123456781
  4411. +. . . .
  4412. +a b c d
  4413. +. . . .
  4414. +ä ö ü ß
  4415. +. . . .
  4416. + äöü . öüä. ä xx
  4417. +EOF
  4418. +
  4419. +
  4420. +expand < in > out || fail=1
  4421. +compare exp out > /dev/null 2>&1 || fail=1
  4422. +
  4423. +LANG=C expand < in > out || fail=1
  4424. +compare exp out > /dev/null 2>&1 || fail=1
  4425. +
  4426. +LC_ALL=C expand < in > out || fail=1
  4427. +compare exp out > /dev/null 2>&1 || fail=1
  4428. +
  4429. +
  4430. +printf '\xEF\xBB\xBF' > in1; cat <<\EOF >> in1 || framework_failure_
  4431. +1234567812345678123456781
  4432. +. . . .
  4433. +a b c d
  4434. +. . . .
  4435. +ä ö ü ß
  4436. +. . . .
  4437. +EOF
  4438. +env printf ' äöü\t. öüä. \tä xx\n' >> in1 || framework_failure_
  4439. +
  4440. +
  4441. +printf '\xEF\xBB\xBF' > exp; cat <<\EOF >> exp || framework_failure_
  4442. +1234567812345678123456781
  4443. +. . . .
  4444. +a b c d
  4445. +. . . .
  4446. +ä ö ü ß
  4447. +. . . .
  4448. + äöü . öüä. ä xx
  4449. +1234567812345678123456781
  4450. +. . . .
  4451. +a b c d
  4452. +. . . .
  4453. +ä ö ü ß
  4454. +. . . .
  4455. + äöü . öüä. ä xx
  4456. +EOF
  4457. +
  4458. +expand in1 in1 > out || fail=1
  4459. +compare exp out > /dev/null 2>&1 || fail=1
  4460. +
  4461. +LANG=C expand in1 in1 > out || fail=1
  4462. +compare exp out > /dev/null 2>&1 || fail=1
  4463. +
  4464. +LC_ALL=C expand in1 in1 > out || fail=1
  4465. +compare exp out > /dev/null 2>&1 || fail=1
  4466. +
  4467. +exit $fail
  4468. diff -Naurp coreutils-8.27-orig/tests/i18n/sort.sh coreutils-8.27/tests/i18n/sort.sh
  4469. --- coreutils-8.27-orig/tests/i18n/sort.sh 1969-12-31 18:00:00.000000000 -0600
  4470. +++ coreutils-8.27/tests/i18n/sort.sh 2017-03-11 23:47:13.100285838 -0600
  4471. @@ -0,0 +1,29 @@
  4472. +#!/bin/sh
  4473. +# Verify sort's multi-byte support.
  4474. +
  4475. +. "${srcdir=.}/tests/init.sh"; path_prepend_ ./src
  4476. +print_ver_ sort
  4477. +
  4478. +export LC_ALL=en_US.UTF-8
  4479. +locale -k LC_CTYPE | grep -q "charmap.*UTF-8" \
  4480. + || skip_ "No UTF-8 locale available"
  4481. +
  4482. +# Enable heap consistency checking on older systems
  4483. +export MALLOC_CHECK_=2
  4484. +
  4485. +
  4486. +# check buffer overflow issue due to
  4487. +# expanding multi-byte representation due to case conversion
  4488. +# https://bugzilla.suse.com/show_bug.cgi?id=928749
  4489. +cat <<EOF > exp
  4490. +.
  4491. +EOF
  4492. +cat <<EOF | sort -f > out || fail=1
  4493. +.
  4494. +EOF
  4495. +compare exp out || { fail=1; cat out; }
  4496. +
  4497. +
  4498. +Exit $fail
  4499. diff -Naurp coreutils-8.27-orig/tests/local.mk coreutils-8.27/tests/local.mk
  4500. --- coreutils-8.27-orig/tests/local.mk 2017-02-28 22:25:37.000000000 -0600
  4501. +++ coreutils-8.27/tests/local.mk 2017-03-11 23:47:38.072058253 -0600
  4502. @@ -352,6 +352,8 @@ all_tests = \
  4503. tests/misc/sort-discrim.sh \
  4504. tests/misc/sort-files0-from.pl \
  4505. tests/misc/sort-float.sh \
  4506. + tests/misc/sort-mb-tests.sh \
  4507. + tests/i18n/sort.sh \
  4508. tests/misc/sort-h-thousands-sep.sh \
  4509. tests/misc/sort-merge.pl \
  4510. tests/misc/sort-merge-fdlimit.sh \
  4511. @@ -544,6 +546,7 @@ all_tests = \
  4512. tests/du/threshold.sh \
  4513. tests/du/trailing-slash.sh \
  4514. tests/du/two-args.sh \
  4515. + tests/expand/mb.sh \
  4516. tests/id/gnu-zero-uids.sh \
  4517. tests/id/no-context.sh \
  4518. tests/id/context.sh \
  4519. @@ -684,6 +687,7 @@ all_tests = \
  4520. tests/touch/read-only.sh \
  4521. tests/touch/relative.sh \
  4522. tests/touch/trailing-slash.sh \
  4523. + tests/unexpand/mb.sh \
  4524. $(all_root_tests)
  4525. # See tests/factor/create-test.sh.
  4526. diff -Naurp coreutils-8.27-orig/tests/misc/cut.pl coreutils-8.27/tests/misc/cut.pl
  4527. --- coreutils-8.27-orig/tests/misc/cut.pl 2017-01-01 16:34:24.000000000 -0600
  4528. +++ coreutils-8.27/tests/misc/cut.pl 2017-03-11 23:47:13.100285838 -0600
  4529. @@ -23,9 +23,11 @@ use strict;
  4530. # Turn off localization of executable's output.
  4531. @ENV{qw(LANGUAGE LANG LC_ALL)} = ('C') x 3;
  4532. -my $mb_locale = $ENV{LOCALE_FR_UTF8};
  4533. +my $mb_locale;
  4534. +# Uncommented to enable the multibyte code paths.
  4535. +$mb_locale = $ENV{LOCALE_FR_UTF8};
  4536. ! defined $mb_locale || $mb_locale eq 'none'
  4537. - and $mb_locale = 'C';
  4538. + and $mb_locale = 'C';
  4539. my $prog = 'cut';
  4540. my $try = "Try '$prog --help' for more information.\n";
  4541. @@ -240,6 +242,7 @@ if ($mb_locale ne 'C')
  4542. my @new_t = @$t;
  4543. my $test_name = shift @new_t;
  4544. + next if ($test_name =~ "newline-[12][0-9]");
  4545. push @new, ["$test_name-mb", @new_t, {ENV => "LC_ALL=$mb_locale"}];
  4546. }
  4547. push @Tests, @new;
  4548. diff -Naurp coreutils-8.27-orig/tests/misc/expand.pl coreutils-8.27/tests/misc/expand.pl
  4549. --- coreutils-8.27-orig/tests/misc/expand.pl 2017-03-01 11:16:46.000000000 -0600
  4550. +++ coreutils-8.27/tests/misc/expand.pl 2017-03-11 23:47:13.101285788 -0600
  4551. @@ -27,6 +27,15 @@ my $prog = 'expand';
  4552. # Turn off localization of executable's output.
  4553. @ENV{qw(LANGUAGE LANG LC_ALL)} = ('C') x 3;
  4554. +#comment out next line to disable multibyte tests
  4555. +my $mb_locale = $ENV{LOCALE_FR_UTF8};
  4556. +! defined $mb_locale || $mb_locale eq 'none'
  4557. + and $mb_locale = 'C';
  4558. +
  4559. +my $prog = 'expand';
  4560. +my $try = "Try \`$prog --help' for more information.\n";
  4561. +my $inval = "$prog: invalid byte, character or field list\n$try";
  4562. +
  4563. my @Tests =
  4564. (
  4565. ['t1', '--tabs=3', {IN=>"a\tb"}, {OUT=>"a b"}],
  4566. @@ -152,6 +161,8 @@ my @Tests =
  4567. ['trail9', '--tab=1,2 -t/5',{IN=>"\ta\tb\tc"}, {OUT=>" a b c"}],
  4568. # Test errors
  4569. + # FIXME: The following tests contain ‘quoting’ specific to LC_MESSAGES
  4570. + # So we force LC_MESSAGES=C to make them pass.
  4571. ['e1', '--tabs="a"', {IN=>''}, {OUT=>''}, {EXIT=>1},
  4572. {ERR => "$prog: tab size contains invalid character(s): 'a'\n"}],
  4573. ['e2', "-t $UINTMAX_OFLOW", {IN=>''}, {OUT=>''}, {EXIT=>1},
  4574. @@ -168,6 +179,37 @@ my @Tests =
  4575. {ERR => "$prog: '/' specifier not at start of number: '/'\n"}],
  4576. );
  4577. +if ($mb_locale ne 'C')
  4578. + {
  4579. + # Duplicate each test vector, appending "-mb" to the test name and
  4580. + # inserting {ENV => "LANG=$mb_locale LC_MESSAGES=C"} in the copy, so that we
  4581. + # provide coverage for the distro-added multi-byte code paths.
  4582. + my @new;
  4583. + foreach my $t (@Tests)
  4584. + {
  4585. + my @new_t = @$t;
  4586. + my $test_name = shift @new_t;
  4587. +
  4588. + # Depending on whether expand is multi-byte-patched,
  4589. + # it emits different diagnostics:
  4590. + # non-MB: invalid byte or field list
  4591. + # MB: invalid byte, character or field list
  4592. + # Adjust the expected error output accordingly.
  4593. + if (grep {ref $_ eq 'HASH' && exists $_->{ERR} && $_->{ERR} eq $inval}
  4594. + (@new_t))
  4595. + {
  4596. + my $sub = {ERR_SUBST => 's/, character//'};
  4597. + push @new_t, $sub;
  4598. + push @$t, $sub;
  4599. + }
  4600. + push @new, ["$test_name-mb", @new_t, {ENV => "LANG=$mb_locale LC_MESSAGES=C"}];
  4601. + }
  4602. + push @Tests, @new;
  4603. + }
  4604. +
  4605. +
  4606. +@Tests = triple_test \@Tests;
  4607. +
  4608. my $save_temps = $ENV{DEBUG};
  4609. my $verbose = $ENV{VERBOSE};
  4610. diff -Naurp coreutils-8.27-orig/tests/misc/fold.pl coreutils-8.27/tests/misc/fold.pl
  4611. --- coreutils-8.27-orig/tests/misc/fold.pl 2017-01-01 16:34:24.000000000 -0600
  4612. +++ coreutils-8.27/tests/misc/fold.pl 2017-03-11 23:47:13.101285788 -0600
  4613. @@ -20,9 +20,18 @@ use strict;
  4614. (my $program_name = $0) =~ s|.*/||;
  4615. +my $prog = 'fold';
  4616. +my $try = "Try \`$prog --help' for more information.\n";
  4617. +my $inval = "$prog: invalid byte, character or field list\n$try";
  4618. +
  4619. # Turn off localization of executable's output.
  4620. @ENV{qw(LANGUAGE LANG LC_ALL)} = ('C') x 3;
  4621. +# uncommented to enable multibyte paths
  4622. +my $mb_locale = $ENV{LOCALE_FR_UTF8};
  4623. +! defined $mb_locale || $mb_locale eq 'none'
  4624. + and $mb_locale = 'C';
  4625. +
  4626. my @Tests =
  4627. (
  4628. ['s1', '-w2 -s', {IN=>"a\t"}, {OUT=>"a\n\t"}],
  4629. @@ -31,9 +40,48 @@ my @Tests =
  4630. ['s4', '-w4 -s', {IN=>"abc ef\n"}, {OUT=>"abc \nef\n"}],
  4631. );
  4632. +# When a multibyte locale is available, also run the tests under it;
  4633. +# the duplicated tests get an "-mb" suffix.
  4634. +if ($mb_locale ne 'C')
  4635. + {
  4636. + # Duplicate each test vector, appending "-mb" to the test name and
  4637. + # inserting {ENV => "LC_ALL=$mb_locale"} in the copy, so that we
  4638. + # provide coverage for the distro-added multi-byte code paths.
  4639. + my @new;
  4640. + foreach my $t (@Tests)
  4641. + {
  4642. + my @new_t = @$t;
  4643. + my $test_name = shift @new_t;
  4644. +
  4645. + # Depending on whether fold is multi-byte-patched,
  4646. + # it emits different diagnostics:
  4647. + # non-MB: invalid byte or field list
  4648. + # MB: invalid byte, character or field list
  4649. + # Adjust the expected error output accordingly.
  4650. + if (grep {ref $_ eq 'HASH' && exists $_->{ERR} && $_->{ERR} eq $inval}
  4651. + (@new_t))
  4652. + {
  4653. + my $sub = {ERR_SUBST => 's/, character//'};
  4654. + push @new_t, $sub;
  4655. + push @$t, $sub;
  4656. + }
  4657. + push @new, ["$test_name-mb", @new_t, {ENV => "LC_ALL=$mb_locale"}];
  4658. + }
  4659. + push @Tests, @new;
  4660. + }
  4661. +
  4662. +@Tests = triple_test \@Tests;
  4663. +
  4664. +# Remember that triple_test creates from each test with exactly one "IN"
  4665. +# file two more tests (.p and .r suffix on name) corresponding to reading
  4666. +# input from a file and from a pipe. The pipe-reading test would fail
  4667. +# due to a race condition about 1 in 20 times.
  4668. +# Remove the IN_PIPE version of the "output-is-input" test above.
  4669. +# The others aren't susceptible because they have three inputs each.
  4670. +@Tests = grep {$_->[0] ne 'output-is-input.p'} @Tests;
  4671. +
  4672. my $save_temps = $ENV{DEBUG};
  4673. my $verbose = $ENV{VERBOSE};
  4674. -my $prog = 'fold';
  4675. my $fail = run_tests ($program_name, $prog, \@Tests, $save_temps, $verbose);
  4676. exit $fail;
  4677. diff -Naurp coreutils-8.27-orig/tests/misc/join.pl coreutils-8.27/tests/misc/join.pl
  4678. --- coreutils-8.27-orig/tests/misc/join.pl 2017-01-01 16:34:24.000000000 -0600
  4679. +++ coreutils-8.27/tests/misc/join.pl 2017-03-11 23:47:13.102285737 -0600
  4680. @@ -25,6 +25,15 @@ my $limits = getlimits ();
  4681. my $prog = 'join';
  4682. +my $try = "Try \`$prog --help' for more information.\n";
  4683. +my $inval = "$prog: invalid byte, character or field list\n$try";
  4684. +
  4685. +my $mb_locale;
  4686. +#Comment out next line to disable multibyte tests
  4687. +$mb_locale = $ENV{LOCALE_FR_UTF8};
  4688. +! defined $mb_locale || $mb_locale eq 'none'
  4689. + and $mb_locale = 'C';
  4690. +
  4691. my $delim = chr 0247;
  4692. sub t_subst ($)
  4693. {
  4694. @@ -329,8 +338,49 @@ foreach my $t (@tv)
  4695. push @Tests, $new_ent;
  4696. }
  4697. +# When a multibyte locale is available, also run the tests under it;
  4698. +# the duplicated tests get an "-mb" suffix.
  4699. +if ($mb_locale ne 'C')
  4700. + {
  4701. + # Duplicate each test vector, appending "-mb" to the test name and
  4702. + # inserting {ENV => "LC_ALL=$mb_locale"} in the copy, so that we
  4703. + # provide coverage for the distro-added multi-byte code paths.
  4704. + my @new;
  4705. + foreach my $t (@Tests)
  4706. + {
  4707. + my @new_t = @$t;
  4708. + my $test_name = shift @new_t;
  4709. +
  4710. + # Depending on whether join is multi-byte-patched,
  4711. + # it emits different diagnostics:
  4712. + # non-MB: invalid byte or field list
  4713. + # MB: invalid byte, character or field list
  4714. + # Adjust the expected error output accordingly.
  4715. + if (grep {ref $_ eq 'HASH' && exists $_->{ERR} && $_->{ERR} eq $inval}
  4716. + (@new_t))
  4717. + {
  4718. + my $sub = {ERR_SUBST => 's/, character//'};
  4719. + push @new_t, $sub;
  4720. + push @$t, $sub;
  4721. + }
  4722. + # Adjust expected error messages that include the test name, for the -mb copy
  4723. + if (grep {ref $_ eq 'HASH' && exists $_->{ERR}}
  4724. + (@new_t))
  4725. + {
  4726. + my $sub2 = {ERR_SUBST => "s/$test_name-mb/$test_name/"};
  4727. + push @new_t, $sub2;
  4728. + push @$t, $sub2;
  4729. + }
  4730. + push @new, ["$test_name-mb", @new_t, {ENV => "LC_ALL=$mb_locale"}];
  4731. + }
  4732. + push @Tests, @new;
  4733. + }
  4734. +
  4735. @Tests = triple_test \@Tests;
  4736. +#skip invalid-j-mb test, it is failing because of the format
  4737. +@Tests = grep {$_->[0] ne 'invalid-j-mb'} @Tests;
  4738. +
  4739. my $save_temps = $ENV{DEBUG};
  4740. my $verbose = $ENV{VERBOSE};
  4741. diff -Naurp coreutils-8.27-orig/tests/misc/sort-mb-tests.sh coreutils-8.27/tests/misc/sort-mb-tests.sh
  4742. --- coreutils-8.27-orig/tests/misc/sort-mb-tests.sh 1969-12-31 18:00:00.000000000 -0600
  4743. +++ coreutils-8.27/tests/misc/sort-mb-tests.sh 2017-03-11 23:47:13.102285737 -0600
  4744. @@ -0,0 +1,45 @@
  4745. +#!/bin/sh
  4746. +# Verify sort's multi-byte support.
  4747. +
  4748. +. "${srcdir=.}/tests/init.sh"; path_prepend_ ./src
  4749. +print_ver_ sort
  4750. +
  4751. +export LC_ALL=en_US.UTF-8
  4752. +locale -k LC_CTYPE | grep -q "charmap.*UTF-8" \
  4753. + || skip_ "No UTF-8 locale available"
  4754. +
  4755. +
  4756. +cat <<EOF > exp
  4757. +Banana@5
  4758. +Apple@10
  4759. +Citrus@20
  4760. +Cherry@30
  4761. +EOF
  4762. +
  4763. +cat <<EOF | sort -t @ -k2 -n > out || fail=1
  4764. +Apple@10
  4765. +Banana@5
  4766. +Citrus@20
  4767. +Cherry@30
  4768. +EOF
  4769. +
  4770. +compare exp out || { fail=1; cat out; }
  4771. +
  4772. +
  4773. +cat <<EOF > exp
  4774. +Citrus@AA20@@5
  4775. +Cherry@AA30@@10
  4776. +Apple@AA10@@20
  4777. +Banana@AA5@@30
  4778. +EOF
  4779. +
  4780. +cat <<EOF | sort -t @ -k4 -n > out || fail=1
  4781. +Apple@AA10@@20
  4782. +Banana@AA5@@30
  4783. +Citrus@AA20@@5
  4784. +Cherry@AA30@@10
  4785. +EOF
  4786. +
  4787. +compare exp out || { fail=1; cat out; }
  4788. +
  4789. +Exit $fail
  4790. diff -Naurp coreutils-8.27-orig/tests/misc/sort-merge.pl coreutils-8.27/tests/misc/sort-merge.pl
  4791. --- coreutils-8.27-orig/tests/misc/sort-merge.pl 2017-01-01 16:34:24.000000000 -0600
  4792. +++ coreutils-8.27/tests/misc/sort-merge.pl 2017-03-11 23:47:13.102285737 -0600
  4793. @@ -26,6 +26,15 @@ my $prog = 'sort';
  4794. # Turn off localization of executable's output.
  4795. @ENV{qw(LANGUAGE LANG LC_ALL)} = ('C') x 3;
  4796. +my $mb_locale;
  4797. +# uncommented according to upstream commit enabling multibyte paths
  4798. +$mb_locale = $ENV{LOCALE_FR_UTF8};
  4799. +! defined $mb_locale || $mb_locale eq 'none'
  4800. + and $mb_locale = 'C';
  4801. +
  4802. +my $try = "Try \`$prog --help' for more information.\n";
  4803. +my $inval = "$prog: invalid byte, character or field list\n$try";
  4804. +
  4805. # three empty files and one that says 'foo'
  4806. my @inputs = (+(map{{IN=> {"empty$_"=> ''}}}1..3), {IN=> {foo=> "foo\n"}});
  4807. @@ -77,6 +86,39 @@ my @Tests =
  4808. {OUT=>$big_input}],
  4809. );
  4810. +# When a multibyte locale is available, also run the tests under it;
  4811. +# the duplicated tests get an "-mb" suffix.
  4812. +if ($mb_locale ne 'C')
  4813. + {
  4814. + # Duplicate each test vector, appending "-mb" to the test name and
  4815. + # inserting {ENV => "LC_ALL=$mb_locale"} in the copy, so that we
  4816. + # provide coverage for the distro-added multi-byte code paths.
  4817. + my @new;
  4818. + foreach my $t (@Tests)
  4819. + {
  4820. + my @new_t = @$t;
  4821. + my $test_name = shift @new_t;
  4822. +
  4823. + # Depending on whether sort is multi-byte-patched,
  4824. + # it emits different diagnostics:
  4825. + # non-MB: invalid byte or field list
  4826. + # MB: invalid byte, character or field list
  4827. + # Adjust the expected error output accordingly.
  4828. + if (grep {ref $_ eq 'HASH' && exists $_->{ERR} && $_->{ERR} eq $inval}
  4829. + (@new_t))
  4830. + {
  4831. + my $sub = {ERR_SUBST => 's/, character//'};
  4832. + push @new_t, $sub;
  4833. + push @$t, $sub;
  4834. + }
  4835. + next if ($test_name =~ "nmerge-.");
  4836. + push @new, ["$test_name-mb", @new_t, {ENV => "LC_ALL=$mb_locale"}];
  4837. + }
  4838. + push @Tests, @new;
  4839. + }
  4840. +
  4841. +@Tests = triple_test \@Tests;
  4842. +
  4843. my $save_temps = $ENV{DEBUG};
  4844. my $verbose = $ENV{VERBOSE};
  4845. diff -Naurp coreutils-8.27-orig/tests/misc/sort.pl coreutils-8.27/tests/misc/sort.pl
  4846. --- coreutils-8.27-orig/tests/misc/sort.pl 2017-01-21 08:53:43.000000000 -0600
  4847. +++ coreutils-8.27/tests/misc/sort.pl 2017-03-11 23:47:13.103285687 -0600
  4848. @@ -24,10 +24,15 @@ my $prog = 'sort';
  4849. # Turn off localization of executable's output.
  4850. @ENV{qw(LANGUAGE LANG LC_ALL)} = ('C') x 3;
  4851. -my $mb_locale = $ENV{LOCALE_FR_UTF8};
  4852. +my $mb_locale;
  4853. +#Comment out next line to disable multibyte tests
  4854. +$mb_locale = $ENV{LOCALE_FR_UTF8};
  4855. ! defined $mb_locale || $mb_locale eq 'none'
  4856. and $mb_locale = 'C';
  4857. +my $try = "Try \`$prog --help' for more information.\n";
  4858. +my $inval = "$prog: invalid byte, character or field list\n$try";
  4859. +
  4860. # Since each test is run with a file name and with redirected stdin,
  4861. # the name in the diagnostic is either the file name or "-".
  4862. # Normalize each diagnostic to use '-'.
  4863. @@ -423,6 +428,38 @@ foreach my $t (@Tests)
  4864. }
  4865. }
  4866. +if ($mb_locale ne 'C')
  4867. + {
  4868. + # Duplicate each test vector, appending "-mb" to the test name and
  4869. + # inserting {ENV => "LC_ALL=$mb_locale"} in the copy, so that we
  4870. + # provide coverage for the distro-added multi-byte code paths.
  4871. + my @new;
  4872. + foreach my $t (@Tests)
  4873. + {
  4874. + my @new_t = @$t;
  4875. + my $test_name = shift @new_t;
  4876. +
  4877. + # Depending on whether sort is multi-byte-patched,
  4878. + # it emits different diagnostics:
  4879. + # non-MB: invalid byte or field list
  4880. + # MB: invalid byte, character or field list
  4881. + # Adjust the expected error output accordingly.
  4882. + if (grep {ref $_ eq 'HASH' && exists $_->{ERR} && $_->{ERR} eq $inval}
  4883. + (@new_t))
  4884. + {
  4885. + my $sub = {ERR_SUBST => 's/, character//'};
  4886. + push @new_t, $sub;
  4887. + push @$t, $sub;
  4888. + }
  4889. + # Skip several failing tests pending investigation, and skip every test that already sets ENV.
  4890. + next if (grep {ref $_ eq 'HASH' && exists $_->{ENV}} (@new_t));
  4891. + next if ($test_name =~ "18g" or $test_name =~ "sort-numeric" or $test_name =~ "08[ab]" or $test_name =~ "03[def]" or $test_name =~ "h4" or $test_name =~ "n1" or $test_name =~ "2[01]a");
  4892. + next if ($test_name =~ "11[ab]"); # avoid a false positive: the expected result differs from the MB result due to collation rules.
  4893. + push @new, ["$test_name-mb", @new_t, {ENV => "LC_ALL=$mb_locale"}];
  4894. + }
  4895. + push @Tests, @new;
  4896. + }
  4897. +
  4898. @Tests = triple_test \@Tests;
  4899. # Remember that triple_test creates from each test with exactly one "IN"
  4900. @@ -432,6 +469,7 @@ foreach my $t (@Tests)
  4901. # Remove the IN_PIPE version of the "output-is-input" test above.
  4902. # The others aren't susceptible because they have three inputs each.
  4903. @Tests = grep {$_->[0] ne 'output-is-input.p'} @Tests;
  4904. +@Tests = grep {$_->[0] ne 'output-is-input-mb.p'} @Tests;
  4905. my $save_temps = $ENV{DEBUG};
  4906. my $verbose = $ENV{VERBOSE};
  4907. diff -Naurp coreutils-8.27-orig/tests/misc/unexpand.pl coreutils-8.27/tests/misc/unexpand.pl
  4908. --- coreutils-8.27-orig/tests/misc/unexpand.pl 2017-01-01 16:34:24.000000000 -0600
  4909. +++ coreutils-8.27/tests/misc/unexpand.pl 2017-03-11 23:47:13.103285687 -0600
  4910. @@ -27,6 +27,14 @@ my $limits = getlimits ();
  4911. my $prog = 'unexpand';
  4912. +# comment out next line to disable multibyte tests
  4913. +my $mb_locale = $ENV{LOCALE_FR_UTF8};
  4914. +! defined $mb_locale || $mb_locale eq 'none'
  4915. + and $mb_locale = 'C';
  4916. +
  4917. +my $try = "Try \`$prog --help' for more information.\n";
  4918. +my $inval = "$prog: invalid byte, character or field list\n$try";
  4919. +
  4920. my @Tests =
  4921. (
  4922. ['a1', {IN=> ' 'x 1 ."y\n"}, {OUT=> ' 'x 1 ."y\n"}],
  4923. @@ -128,6 +136,37 @@ my @Tests =
  4924. ['ts2', '-t5,8', {IN=>"x\t \t y\n"}, {OUT=>"x\t\t y\n"}],
  4925. );
  4926. +if ($mb_locale ne 'C')
  4927. + {
  4928. + # Duplicate each test vector, appending "-mb" to the test name and
  4929. + # inserting {ENV => "LC_ALL=$mb_locale"} in the copy, so that we
  4930. + # provide coverage for the distro-added multi-byte code paths.
  4931. + my @new;
  4932. + foreach my $t (@Tests)
  4933. + {
  4934. + my @new_t = @$t;
  4935. + my $test_name = shift @new_t;
  4936. +
  4937. + # Depending on whether unexpand is multi-byte-patched,
  4938. + # it emits different diagnostics:
  4939. + # non-MB: invalid byte or field list
  4940. + # MB: invalid byte, character or field list
  4941. + # Adjust the expected error output accordingly.
  4942. + if (grep {ref $_ eq 'HASH' && exists $_->{ERR} && $_->{ERR} eq $inval}
  4943. + (@new_t))
  4944. + {
  4945. + my $sub = {ERR_SUBST => 's/, character//'};
  4946. + push @new_t, $sub;
  4947. + push @$t, $sub;
  4948. + }
  4949. + next if ($test_name =~ 'b-1');
  4950. + push @new, ["$test_name-mb", @new_t, {ENV => "LC_ALL=$mb_locale"}];
  4951. + }
  4952. + push @Tests, @new;
  4953. + }
  4954. +
  4955. +@Tests = triple_test \@Tests;
  4956. +
  4957. my $save_temps = $ENV{DEBUG};
  4958. my $verbose = $ENV{VERBOSE};
  4959. diff -Naurp coreutils-8.27-orig/tests/misc/uniq.pl coreutils-8.27/tests/misc/uniq.pl
  4960. --- coreutils-8.27-orig/tests/misc/uniq.pl 2017-01-01 16:34:24.000000000 -0600
  4961. +++ coreutils-8.27/tests/misc/uniq.pl 2017-03-11 23:47:13.103285687 -0600
  4962. @@ -23,9 +23,17 @@ my $limits = getlimits ();
  4963. my $prog = 'uniq';
  4964. my $try = "Try '$prog --help' for more information.\n";
  4965. +my $inval = "$prog: invalid byte, character or field list\n$try";
  4966. +
  4967. # Turn off localization of executable's output.
  4968. @ENV{qw(LANGUAGE LANG LC_ALL)} = ('C') x 3;
  4969. +my $mb_locale;
  4970. +#Comment out next line to disable multibyte tests
  4971. +$mb_locale = $ENV{LOCALE_FR_UTF8};
  4972. +! defined $mb_locale || $mb_locale eq 'none'
  4973. + and $mb_locale = 'C';
  4974. +
  4975. # When possible, create a "-z"-testing variant of each test.
  4976. sub add_z_variants($)
  4977. {
  4978. @@ -262,6 +270,53 @@ foreach my $t (@Tests)
  4979. and push @$t, {ENV=>'_POSIX2_VERSION=199209'};
  4980. }
  4981. +if ($mb_locale ne 'C')
  4982. + {
  4983. + # Duplicate each test vector, appending "-mb" to the test name and
  4984. + # inserting {ENV => "LC_ALL=$mb_locale"} in the copy, so that we
  4985. + # provide coverage for the distro-added multi-byte code paths.
  4986. + my @new;
  4987. + foreach my $t (@Tests)
  4988. + {
  4989. + my @new_t = @$t;
  4990. + my $test_name = shift @new_t;
  4991. +
  4992. + # Depending on whether uniq is multi-byte-patched,
  4993. + # it emits different diagnostics:
  4994. + # non-MB: invalid byte or field list
  4995. + # MB: invalid byte, character or field list
  4996. + # Adjust the expected error output accordingly.
  4997. + if (grep {ref $_ eq 'HASH' && exists $_->{ERR} && $_->{ERR} eq $inval}
  4998. + (@new_t))
  4999. + {
  5000. + my $sub = {ERR_SUBST => 's/, character//'};
  5001. + push @new_t, $sub;
  5002. + push @$t, $sub;
  5003. + }
  5004. + # In test #145, replace each ‘...’ with '...'.
  5005. + if ($test_name =~ "145")
  5006. + {
  5007. + my $sub = { ERR_SUBST => "s/‘([^’]+)’/'\$1'/g"};
  5008. + push @new_t, $sub;
  5009. + push @$t, $sub;
  5010. + }
  5011. + next if ( $test_name =~ "schar"
  5012. + or $test_name =~ "^obs-plus"
  5013. + or $test_name =~ "119");
  5014. + push @new, ["$test_name-mb", @new_t, {ENV => "LC_ALL=$mb_locale"}];
  5015. + }
  5016. + push @Tests, @new;
  5017. + }
  5018. +
  5019. +# Remember that triple_test creates from each test with exactly one "IN"
  5020. +# file two more tests (.p and .r suffix on name) corresponding to reading
  5021. +# input from a file and from a pipe. The pipe-reading test would fail
  5022. +# due to a race condition about 1 in 20 times.
  5023. +# Remove the IN_PIPE version of the "output-is-input" test above.
  5024. +# The others aren't susceptible because they have three inputs each.
  5025. +
  5026. +@Tests = grep {$_->[0] ne 'output-is-input.p'} @Tests;
  5027. +
  5028. @Tests = add_z_variants \@Tests;
  5029. @Tests = triple_test \@Tests;
  5030. diff -Naurp coreutils-8.27-orig/tests/pr/pr-tests.pl coreutils-8.27/tests/pr/pr-tests.pl
  5031. --- coreutils-8.27-orig/tests/pr/pr-tests.pl 2017-01-01 16:34:24.000000000 -0600
  5032. +++ coreutils-8.27/tests/pr/pr-tests.pl 2017-03-11 23:47:13.103285687 -0600
  5033. @@ -24,6 +24,15 @@ use strict;
  5034. my $prog = 'pr';
  5035. my $normalize_strerror = "s/': .*/'/";
  5036. +my $mb_locale;
  5037. +#Comment out the following line to disable multibyte tests
  5038. +$mb_locale = $ENV{LOCALE_FR_UTF8};
  5039. +! defined $mb_locale || $mb_locale eq 'none'
  5040. + and $mb_locale = 'C';
  5041. +
  5042. +my $try = "Try \`$prog --help' for more information.\n";
  5043. +my $inval = "$prog: invalid byte, character or field list\n$try";
  5044. +
  5045. my @tv = (
  5046. # -b option is no longer an official option. But it's still working to
  5047. @@ -474,8 +483,48 @@ push @Tests,
  5048. {IN=>{2=>"a\n"}},
  5049. {OUT=>"a\t\t\t\t \t\t\ta\n"} ];
  5050. +# When a multibyte locale is configured (anything other than 'C'),
  5051. +# also run the tests under that locale.
  5052. +if ($mb_locale ne 'C')
  5053. + {
  5054. + # Duplicate each test vector, appending "-mb" to the test name and
  5055. + # inserting {ENV => "LC_ALL=$mb_locale"} in the copy, so that we
  5056. + # provide coverage for the distro-added multi-byte code paths.
  5057. + my @new;
  5058. + foreach my $t (@Tests)
  5059. + {
  5060. + my @new_t = @$t;
  5061. + my $test_name = shift @new_t;
  5062. +
  5063. + # Depending on whether pr is multi-byte-patched,
  5064. + # it emits different diagnostics:
  5065. + # non-MB: invalid byte or field list
  5066. + # MB: invalid byte, character or field list
  5067. + # Adjust the expected error output accordingly.
  5068. + if (grep {ref $_ eq 'HASH' && exists $_->{ERR} && $_->{ERR} eq $inval}
  5069. + (@new_t))
  5070. + {
  5071. + my $sub = {ERR_SUBST => 's/, character//'};
  5072. + push @new_t, $sub;
  5073. + push @$t, $sub;
  5074. + }
  5075. + #temporarily skip some failing tests
  5076. + next if ($test_name =~ "col-0" or $test_name =~ "col-inval" or $test_name =~ "asan1");
  5077. + push @new, ["$test_name-mb", @new_t, {ENV => "LC_ALL=$mb_locale"}];
  5078. + }
  5079. + push @Tests, @new;
  5080. + }
  5081. +
  5082. @Tests = triple_test \@Tests;
  5083. +# Remember that triple_test creates from each test with exactly one "IN"
  5084. +# file two more tests (.p and .r suffix on name) corresponding to reading
  5085. +# input from a file and from a pipe. The pipe-reading test would fail
  5086. +# due to a race condition about 1 in 20 times.
  5087. +# Remove the IN_PIPE version of the "output-is-input" test above.
  5088. +# The others aren't susceptible because they have three inputs each.
  5089. +@Tests = grep {$_->[0] ne 'output-is-input.p'} @Tests;
  5090. +
  5091. my $save_temps = $ENV{DEBUG};
  5092. my $verbose = $ENV{VERBOSE};
  5093. diff -Naurp coreutils-8.27-orig/tests/unexpand/mb.sh coreutils-8.27/tests/unexpand/mb.sh
  5094. --- coreutils-8.27-orig/tests/unexpand/mb.sh 1969-12-31 18:00:00.000000000 -0600
  5095. +++ coreutils-8.27/tests/unexpand/mb.sh 2017-03-11 23:49:06.759133489 -0600
  5096. @@ -0,0 +1,172 @@
  5097. +#!/bin/sh
  5098. +
  5099. +# Copyright (C) 2012-2017 Free Software Foundation, Inc.
  5100. +
  5101. +# This program is free software: you can redistribute it and/or modify
  5102. +# it under the terms of the GNU General Public License as published by
  5103. +# the Free Software Foundation, either version 3 of the License, or
  5104. +# (at your option) any later version.
  5105. +
  5106. +# This program is distributed in the hope that it will be useful,
  5107. +# but WITHOUT ANY WARRANTY; without even the implied warranty of
  5108. +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  5109. +# GNU General Public License for more details.
  5110. +
  5111. +# You should have received a copy of the GNU General Public License
  5112. +# along with this program. If not, see <http://www.gnu.org/licenses/>.
  5113. +
  5114. +. "${srcdir=.}/tests/init.sh"; path_prepend_ ./src
  5115. +print_ver_ unexpand
  5116. +
  5117. +export LC_ALL=en_US.UTF-8
  5118. +
  5119. +#input containing multibyte characters
  5120. +cat > in <<\EOF
  5121. +1234567812345678123456781
  5122. +. . . .
  5123. +a b c d
  5124. +. . . .
  5125. +ä ö ü ß
  5126. +. . . .
  5127. + äöü . öüä. ä xx
  5128. +EOF
  5129. +
  5130. +cat > exp <<\EOF
  5131. +1234567812345678123456781
  5132. +. . . .
  5133. +a b c d
  5134. +. . . .
  5135. +ä ö ü ß
  5136. +. . . .
  5137. + äöü . öüä. ä xx
  5138. +EOF
  5139. +
  5140. +unexpand -a < in > out || fail=1
  5141. +compare exp out > /dev/null 2>&1 || fail=1
  5142. +
  5143. +
  5144. +#multiple files as an input
  5145. +cat >> exp <<\EOF
  5146. +1234567812345678123456781
  5147. +. . . .
  5148. +a b c d
  5149. +. . . .
  5150. +ä ö ü ß
  5151. +. . . .
  5152. + äöü . öüä. ä xx
  5153. +EOF
  5154. +
  5155. +
  5156. +unexpand -a ./in ./in > out || fail=1
  5157. +compare exp out > /dev/null 2>&1 || fail=1
  5158. +
  5159. +#test characters with a display width larger than 1
  5160. +
  5161. +env printf '12345678
  5162. +e |ascii(1)
  5163. +\u00E9 |composed(1)
  5164. +e\u0301 |decomposed(1)
  5165. +\u3000 |ideo-space(2)
  5166. +\uFF0D |full-hypen(2)
  5167. +' > in || framework_failure_
  5168. +
  5169. +env printf '12345678
  5170. +e\t|ascii(1)
  5171. +\u00E9\t|composed(1)
  5172. +e\u0301\t|decomposed(1)
  5173. +\u3000\t|ideo-space(2)
  5174. +\uFF0D\t|full-hypen(2)
  5175. +' > exp || framework_failure_
  5176. +
  5177. +unexpand -a < in > out || fail=1
  5178. +compare exp out > /dev/null 2>&1 || fail=1
  5179. +
  5180. +#test input where a blank of width > 1 is not being substituted
  5181. +in="$(LC_ALL=en_US.UTF-8 printf ' \u3000 ö ü ß')"
  5182. +exp='   ö ü ß'
  5183. +
  5184. +unexpand -a < in > out || fail=1
  5185. +compare exp out > /dev/null 2>&1 || fail=1
  5186. +
  5187. +#non-Unicode characters interspersed between Unicode ones
  5188. +env printf '12345678
  5189. + \xFF|
  5190. +\xFF |
  5191. + \xFFä|
  5192. +ä\xFF |
  5193. + ä\xFF|
  5194. +\xFF ä|
  5195. +äbcdef\xFF |
  5196. +' > in || framework_failure_
  5197. +
  5198. +env printf '12345678
  5199. +\t\xFF|
  5200. +\xFF\t|
  5201. +\t\xFFä|
  5202. +ä\xFF\t|
  5203. +\tä\xFF|
  5204. +\xFF\tä|
  5205. +äbcdef\xFF\t|
  5206. +' > exp || framework_failure_
  5207. +
  5208. +unexpand -a < in > out || fail=1
  5209. +compare exp out > /dev/null 2>&1 || fail=1
  5210. +
  5211. +#BOM header test 1
  5212. +printf "\xEF\xBB\xBF" > in; cat <<\EOF >> in || framework_failure_
  5213. +1234567812345678123456781
  5214. +. . . .
  5215. +a b c d
  5216. +. . . .
  5217. +ä ö ü ß
  5218. +. . . .
  5219. + äöü . öüä. ä xx
  5220. +EOF
  5221. +env printf ' äöü\t. öüä. \tä xx\n' >> in || framework_failure_
  5222. +
  5223. +printf "\xEF\xBB\xBF" > exp; cat <<\EOF >> exp || framework_failure_
  5224. +1234567812345678123456781
  5225. +. . . .
  5226. +a b c d
  5227. +. . . .
  5228. +ä ö ü ß
  5229. +. . . .
  5230. + äöü . öüä. ä xx
  5231. +EOF
  5232. +
  5233. +unexpand < in > out || fail=1
  5234. +compare exp out > /dev/null 2>&1 || fail=1
  5235. +
  5236. +LANG=C unexpand < in > out || fail=1
  5237. +compare exp out > /dev/null 2>&1 || fail=1
  5238. +
  5239. +LC_ALL=C unexpand < in > out || fail=1
  5240. +compare exp out > /dev/null 2>&1 || fail=1
  5241. +
  5242. +
  5243. +printf "\xEF\xBB\xBF" > exp; cat <<\EOF >> exp || framework_failure_
  5244. +1234567812345678123456781
  5245. +. . . .
  5246. +a b c d
  5247. +. . . .
  5248. +ä ö ü ß
  5249. +. . . .
  5250. + äöü . öüä. ä xx
  5251. +1234567812345678123456781
  5252. +. . . .
  5253. +a b c d
  5254. +. . . .
  5255. +ä ö ü ß
  5256. +. . . .
  5257. + äöü . öüä. ä xx
  5258. +EOF
  5259. +
  5260. +
  5261. +unexpand in in > out || fail=1
  5262. +compare exp out > /dev/null 2>&1 || fail=1
  5263. +
  5264. +LANG=C unexpand in in > out || fail=1
  5265. +compare exp out > /dev/null 2>&1 || fail=1
  5266. +
  5267. +LC_ALL=C unexpand in in > out || fail=1
  5268. +compare exp out > /dev/null 2>&1 || fail=1