Display invalid multibyte sequences as Unicode FFFD, take 2; also clean up the character-parsing functions.


git-svn-id: svn://svn.savannah.gnu.org/nano/trunk/nano@2909 35c25a1d-7b9e-4130-9fde-d3aeb78583b8
master
David Lawrence Ramsey 2005-07-21 22:12:03 +00:00
parent 66444c37b3
commit 61f567378a
4 changed files with 66 additions and 51 deletions

View File

@ -78,6 +78,16 @@ CVS code -
out of the prompt, and that magichistory is properly updated
when we change it and then move up. New function
history_reset(); changes to nanogetstr(). (DLR)
- Various character-handling cleanups. If we get an invalid
multibyte sequence, treat it as Unicode FFFD (Replacement
Character), unless we're determining if it's a control
character or searching for a match to it. Also, remove
unneeded variables and checks when parsing multibyte
sequences. Changes to is_alnum_mbchar(), is_blank_mbchar(),
is_cntrl_mbchar(), is_punct_mbchar(), control_mbrep(),
mbwidth(), make_mbchar(), parse_mbchar(), mbstrncasecmp(),
mbstrcasestr(), mbrevstrcasestr(), mbstrchr(), and
display_string(). (DLR)
- chars.c:
mbstrchr()
- Don't count matches between valid and invalid multibyte
@ -147,6 +157,9 @@ CVS code -
HAVE_SNPRINTF. (DLR)
- Remove TOP from the topmidnone enum, and rename it centernone.
(DLR)
- proto.h:
- Add declarations for bad_mbchar and bad_mbchar_len, so that we
can use them in display_string() as well as chars.c. (DLR)
- rcfile.c:
nregcomp()
- Return TRUE when the compilation succeeds and FALSE otherwise,

View File

@ -37,6 +37,14 @@
#ifdef HAVE_WCTYPE_H #ifdef HAVE_WCTYPE_H
#include <wctype.h> #include <wctype.h>
#endif #endif
static const wchar_t bad_wchar = 0xFFFD;
/* If we get an invalid multibyte sequence, we treat it as
* Unicode FFFD (Replacement Character), unless we're
* determining if it's a control character or searching for a
* match to it. */
const char *bad_mbchar = "\xEF\xBF\xBD";
/* The same replacement character as bad_wchar, but as a
* multibyte string: the three bytes EF BF BD are the UTF-8
* encoding of Unicode FFFD. */
const int bad_mbchar_len = 3;
/* The number of bytes in bad_mbchar, not counting the
* terminating null byte. */
#endif #endif
#ifndef HAVE_ISBLANK #ifndef HAVE_ISBLANK
@ -70,11 +78,10 @@ bool is_alnum_mbchar(const char *c)
#ifdef ENABLE_UTF8 #ifdef ENABLE_UTF8
if (ISSET(USE_UTF8)) { if (ISSET(USE_UTF8)) {
wchar_t wc; wchar_t wc;
int c_mb_len = mbtowc(&wc, c, MB_CUR_MAX);
if (c_mb_len <= 0) { if (mbtowc(&wc, c, MB_CUR_MAX) < 0) {
mbtowc(NULL, NULL, 0); mbtowc(NULL, NULL, 0);
wc = (unsigned char)*c; wc = bad_wchar;
} }
return iswalnum(wc); return iswalnum(wc);
@ -91,11 +98,10 @@ bool is_blank_mbchar(const char *c)
#ifdef ENABLE_UTF8 #ifdef ENABLE_UTF8
if (ISSET(USE_UTF8)) { if (ISSET(USE_UTF8)) {
wchar_t wc; wchar_t wc;
int c_mb_len = mbtowc(&wc, c, MB_CUR_MAX);
if (c_mb_len <= 0) { if (mbtowc(&wc, c, MB_CUR_MAX) < 0) {
mbtowc(NULL, NULL, 0); mbtowc(NULL, NULL, 0);
wc = (unsigned char)*c; wc = bad_wchar;
} }
return iswblank(wc); return iswblank(wc);
@ -132,9 +138,8 @@ bool is_cntrl_mbchar(const char *c)
#ifdef ENABLE_UTF8 #ifdef ENABLE_UTF8
if (ISSET(USE_UTF8)) { if (ISSET(USE_UTF8)) {
wchar_t wc; wchar_t wc;
int c_mb_len = mbtowc(&wc, c, MB_CUR_MAX);
if (c_mb_len <= 0) { if (mbtowc(&wc, c, MB_CUR_MAX) < 0) {
mbtowc(NULL, NULL, 0); mbtowc(NULL, NULL, 0);
wc = (unsigned char)*c; wc = (unsigned char)*c;
} }
@ -155,9 +160,9 @@ bool is_punct_mbchar(const char *c)
wchar_t wc; wchar_t wc;
int c_mb_len = mbtowc(&wc, c, MB_CUR_MAX); int c_mb_len = mbtowc(&wc, c, MB_CUR_MAX);
if (c_mb_len <= 0) { if (c_mb_len < 0) {
mbtowc(NULL, NULL, 0); mbtowc(NULL, NULL, 0);
wc = (unsigned char)*c; wc = bad_wchar;
} }
return iswpunct(wc); return iswpunct(wc);
@ -215,17 +220,18 @@ char *control_mbrep(const char *c, char *crep, int *crep_len)
if (ISSET(USE_UTF8)) { if (ISSET(USE_UTF8)) {
wchar_t wc; wchar_t wc;
if (mbtowc(&wc, c, MB_CUR_MAX) <= 0) { if (mbtowc(&wc, c, MB_CUR_MAX) < 0) {
mbtowc(NULL, NULL, 0); mbtowc(NULL, NULL, 0);
wc = (unsigned char)*c; crep = (char *)bad_mbchar;
} *crep_len = bad_mbchar_len;
} else {
*crep_len = wctomb(crep, control_wrep(wc)); *crep_len = wctomb(crep, control_wrep(wc));
if (*crep_len <= 0) { if (*crep_len < 0) {
wctomb(NULL, 0); wctomb(NULL, 0);
*crep_len = 0; *crep_len = 0;
} }
}
} else { } else {
#endif #endif
*crep_len = 1; *crep_len = 1;
@ -245,11 +251,11 @@ int mbwidth(const char *c)
#ifdef ENABLE_UTF8 #ifdef ENABLE_UTF8
if (ISSET(USE_UTF8)) { if (ISSET(USE_UTF8)) {
wchar_t wc; wchar_t wc;
int c_mb_len = mbtowc(&wc, c, MB_CUR_MAX), width; int width;
if (c_mb_len <= 0) { if (mbtowc(&wc, c, MB_CUR_MAX) < 0) {
mbtowc(NULL, NULL, 0); mbtowc(NULL, NULL, 0);
wc = (unsigned char)*c; wc = bad_wchar;
} }
width = wcwidth(wc); width = wcwidth(wc);
@ -289,7 +295,7 @@ char *make_mbchar(int chr, int *chr_mb_len)
chr_mb = charalloc(MB_CUR_MAX); chr_mb = charalloc(MB_CUR_MAX);
*chr_mb_len = wctomb(chr_mb, chr); *chr_mb_len = wctomb(chr_mb, chr);
if (*chr_mb_len <= 0) { if (*chr_mb_len < 0) {
wctomb(NULL, 0); wctomb(NULL, 0);
*chr_mb_len = 0; *chr_mb_len = 0;
} }
@ -324,15 +330,15 @@ int parse_mbchar(const char *buf, char *chr, bool *bad_chr, size_t
/* Get the number of bytes in the multibyte character. */ /* Get the number of bytes in the multibyte character. */
buf_mb_len = mblen(buf, MB_CUR_MAX); buf_mb_len = mblen(buf, MB_CUR_MAX);
/* If buf contains a null byte or an invalid multibyte /* If buf contains an invalid multibyte character, set bad_chr
* character, set bad_chr to TRUE (if it contains the latter) * to TRUE and interpret buf's first byte. */
* and interpret buf's first byte. */ if (buf_mb_len < 0) {
if (buf_mb_len <= 0) {
mblen(NULL, 0); mblen(NULL, 0);
if (buf_mb_len < 0 && bad_chr != NULL) if (bad_chr != NULL)
*bad_chr = TRUE; *bad_chr = TRUE;
buf_mb_len = 1; buf_mb_len = 1;
} } else if (buf_mb_len == 0)
buf_mb_len++;
/* Save the multibyte character in chr. */ /* Save the multibyte character in chr. */
if (chr != NULL) { if (chr != NULL) {
@ -480,7 +486,7 @@ int mbstrncasecmp(const char *s1, const char *s2, size_t n)
s1_mb_len = parse_mbchar(s1, s1_mb, NULL, NULL); s1_mb_len = parse_mbchar(s1, s1_mb, NULL, NULL);
if (mbtowc(&ws1, s1_mb, s1_mb_len) <= 0) { if (mbtowc(&ws1, s1_mb, s1_mb_len) < 0) {
mbtowc(NULL, NULL, 0); mbtowc(NULL, NULL, 0);
ws1 = (unsigned char)*s1_mb; ws1 = (unsigned char)*s1_mb;
bad_s1_mb = TRUE; bad_s1_mb = TRUE;
@ -488,7 +494,7 @@ int mbstrncasecmp(const char *s1, const char *s2, size_t n)
s2_mb_len = parse_mbchar(s2, s2_mb, NULL, NULL); s2_mb_len = parse_mbchar(s2, s2_mb, NULL, NULL);
if (mbtowc(&ws2, s2_mb, s2_mb_len) <= 0) { if (mbtowc(&ws2, s2_mb, s2_mb_len) < 0) {
mbtowc(NULL, NULL, 0); mbtowc(NULL, NULL, 0);
ws2 = (unsigned char)*s2_mb; ws2 = (unsigned char)*s2_mb;
bad_s2_mb = TRUE; bad_s2_mb = TRUE;
@ -554,7 +560,7 @@ const char *mbstrcasestr(const char *haystack, const char *needle)
r_mb_len = parse_mbchar(r, r_mb, NULL, NULL); r_mb_len = parse_mbchar(r, r_mb, NULL, NULL);
if (mbtowc(&wr, r_mb, r_mb_len) <= 0) { if (mbtowc(&wr, r_mb, r_mb_len) < 0) {
mbtowc(NULL, NULL, 0); mbtowc(NULL, NULL, 0);
wr = (unsigned char)*r; wr = (unsigned char)*r;
bad_r_mb = TRUE; bad_r_mb = TRUE;
@ -562,7 +568,7 @@ const char *mbstrcasestr(const char *haystack, const char *needle)
q_mb_len = parse_mbchar(q, q_mb, NULL, NULL); q_mb_len = parse_mbchar(q, q_mb, NULL, NULL);
if (mbtowc(&wq, q_mb, q_mb_len) <= 0) { if (mbtowc(&wq, q_mb, q_mb_len) < 0) {
mbtowc(NULL, NULL, 0); mbtowc(NULL, NULL, 0);
wq = (unsigned char)*q; wq = (unsigned char)*q;
bad_q_mb = TRUE; bad_q_mb = TRUE;
@ -660,7 +666,7 @@ const char *mbrevstrcasestr(const char *haystack, const char *needle,
r_mb_len = parse_mbchar(r, r_mb, NULL, NULL); r_mb_len = parse_mbchar(r, r_mb, NULL, NULL);
if (mbtowc(&wr, r_mb, r_mb_len) <= 0) { if (mbtowc(&wr, r_mb, r_mb_len) < 0) {
mbtowc(NULL, NULL, 0); mbtowc(NULL, NULL, 0);
wr = (unsigned char)*r; wr = (unsigned char)*r;
bad_r_mb = TRUE; bad_r_mb = TRUE;
@ -668,7 +674,7 @@ const char *mbrevstrcasestr(const char *haystack, const char *needle,
q_mb_len = parse_mbchar(q, q_mb, NULL, NULL); q_mb_len = parse_mbchar(q, q_mb, NULL, NULL);
if (mbtowc(&wq, q_mb, q_mb_len) <= 0) { if (mbtowc(&wq, q_mb, q_mb_len) < 0) {
mbtowc(NULL, NULL, 0); mbtowc(NULL, NULL, 0);
wq = (unsigned char)*q; wq = (unsigned char)*q;
bad_q_mb = TRUE; bad_q_mb = TRUE;
@ -766,7 +772,7 @@ char *mbstrchr(const char *s, char *c)
wchar_t ws, wc; wchar_t ws, wc;
int c_mb_len = mbtowc(&wc, c, MB_CUR_MAX); int c_mb_len = mbtowc(&wc, c, MB_CUR_MAX);
if (c_mb_len <= 0) { if (c_mb_len < 0) {
mbtowc(NULL, NULL, 0); mbtowc(NULL, NULL, 0);
wc = (unsigned char)*c; wc = (unsigned char)*c;
bad_c_mb = TRUE; bad_c_mb = TRUE;
@ -775,7 +781,7 @@ char *mbstrchr(const char *s, char *c)
while (*s != '\0') { while (*s != '\0') {
int s_mb_len = parse_mbchar(s, s_mb, NULL, NULL); int s_mb_len = parse_mbchar(s, s_mb, NULL, NULL);
if (mbtowc(&ws, s_mb, s_mb_len) <= 0) { if (mbtowc(&ws, s_mb, s_mb_len) < 0) {
mbtowc(NULL, NULL, 0); mbtowc(NULL, NULL, 0);
ws = (unsigned char)*s; ws = (unsigned char)*s;
bad_s_mb = TRUE; bad_s_mb = TRUE;

View File

@ -138,6 +138,11 @@ extern bool curses_ended;
extern char *homedir; extern char *homedir;
#ifdef ENABLE_UTF8
extern const char *bad_mbchar;
extern const int bad_mbchar_len;
#endif
/* The functions we want available. */ /* The functions we want available. */
/* Public functions in chars.c. */ /* Public functions in chars.c. */

View File

@ -2365,8 +2365,8 @@ char *display_string(const char *buf, size_t start_col, size_t len, bool
start_col++; start_col++;
} }
/* If buf contains a control character, interpret it. If buf /* If buf contains a control character, interpret it. If buf
* contains an invalid multibyte control character, interpret * contains an invalid multibyte control character, display it
* it as though it's a normal control character.*/ * as such.*/
} else if (is_cntrl_mbchar(buf_mb)) { } else if (is_cntrl_mbchar(buf_mb)) {
char *ctrl_buf_mb = charalloc(mb_cur_max()); char *ctrl_buf_mb = charalloc(mb_cur_max());
int ctrl_buf_mb_len, i; int ctrl_buf_mb_len, i;
@ -2402,21 +2402,12 @@ char *display_string(const char *buf, size_t start_col, size_t len, bool
#ifdef ENABLE_UTF8 #ifdef ENABLE_UTF8
/* If buf contains an invalid multibyte non-control /* If buf contains an invalid multibyte non-control
* character, interpret it as though it's a normal * character, display it as such. */
* non-control character. */
if (ISSET(USE_UTF8) && bad_char) { if (ISSET(USE_UTF8) && bad_char) {
char *bad_buf_mb; for (i = 0; i < bad_mbchar_len; i++)
int bad_buf_mb_len; converted[index++] = bad_mbchar[i];
bad_buf_mb = make_mbchar((unsigned char)*buf_mb, start_col += mbwidth(bad_mbchar);
&bad_buf_mb_len);
for (i = 0; i < bad_buf_mb_len; i++)
converted[index++] = bad_buf_mb[i];
start_col += mbwidth(bad_buf_mb);
free(bad_buf_mb);
} else { } else {
#endif #endif
for (i = 0; i < buf_mb_len; i++) for (i = 0; i < buf_mb_len; i++)