display invalid multibyte sequences as Unicode FFFD, take 2; also clean
up the character-parsing functions

git-svn-id: svn://svn.savannah.gnu.org/nano/trunk/nano@2909 35c25a1d-7b9e-4130-9fde-d3aeb78583b8
master
parent
66444c37b3
commit
61f567378a
13
ChangeLog
13
ChangeLog
|
@ -78,6 +78,16 @@ CVS code -
|
||||||
out of the prompt, and that magichistory is properly updated
|
out of the prompt, and that magichistory is properly updated
|
||||||
when we change it and then move up. New function
|
when we change it and then move up. New function
|
||||||
history_reset(); changes to nanogetstr(). (DLR)
|
history_reset(); changes to nanogetstr(). (DLR)
|
||||||
|
- Various character-handling cleanups. If we get an invalid
|
||||||
|
multibyte sequence, treat it as Unicode FFFD (Replacement
|
||||||
|
Character), unless we're determining if it's a control
|
||||||
|
character or searching for a match to it. Also, remove
|
||||||
|
unneeded variables and checks when parsing multibyte
|
||||||
|
sequences. Changes to is_alnum_mbchar(), is_blank_mbchar(),
|
||||||
|
is_cntrl_mbchar(), is_punct_mbchar(), control_mbrep(),
|
||||||
|
mbwidth(), make_mbchar(), parse_mbchar(), mbstrncasecmp(),
|
||||||
|
mbstrcasestr(), mbrevstrcasestr(), mbstrchr(), and
|
||||||
|
display_string(). (DLR)
|
||||||
- chars.c:
|
- chars.c:
|
||||||
mbstrchr()
|
mbstrchr()
|
||||||
- Don't count matches between valid and invalid multibyte
|
- Don't count matches between valid and invalid multibyte
|
||||||
|
@ -147,6 +157,9 @@ CVS code -
|
||||||
HAVE_SNPRINTF. (DLR)
|
HAVE_SNPRINTF. (DLR)
|
||||||
- Remove TOP from the topmidnone enum, and rename it centernone.
|
- Remove TOP from the topmidnone enum, and rename it centernone.
|
||||||
(DLR)
|
(DLR)
|
||||||
|
- proto.h:
|
||||||
|
- Add declarations for bad_mbchar and bad_mbchar_len, so that we
|
||||||
|
can use them in display_string() as well as chars.c. (DLR)
|
||||||
- rcfile.c:
|
- rcfile.c:
|
||||||
nregcomp()
|
nregcomp()
|
||||||
- Return TRUE when the compilation succeeds and FALSE otherwise,
|
- Return TRUE when the compilation succeeds and FALSE otherwise,
|
||||||
|
|
72
src/chars.c
72
src/chars.c
|
@ -37,6 +37,14 @@
|
||||||
#ifdef HAVE_WCTYPE_H
|
#ifdef HAVE_WCTYPE_H
|
||||||
#include <wctype.h>
|
#include <wctype.h>
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
static const wchar_t bad_wchar = 0xFFFD;
|
||||||
|
/* If we get an invalid multibyte sequence, we treat it as
|
||||||
|
* Unicode FFFD (Replacement Character), unless we're
|
||||||
|
* determining if it's a control character or searching for a
|
||||||
|
* match to it. */
|
||||||
|
const char *bad_mbchar = "\xEF\xBF\xBD";
|
||||||
|
const int bad_mbchar_len = 3;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifndef HAVE_ISBLANK
|
#ifndef HAVE_ISBLANK
|
||||||
|
@ -70,11 +78,10 @@ bool is_alnum_mbchar(const char *c)
|
||||||
#ifdef ENABLE_UTF8
|
#ifdef ENABLE_UTF8
|
||||||
if (ISSET(USE_UTF8)) {
|
if (ISSET(USE_UTF8)) {
|
||||||
wchar_t wc;
|
wchar_t wc;
|
||||||
int c_mb_len = mbtowc(&wc, c, MB_CUR_MAX);
|
|
||||||
|
|
||||||
if (c_mb_len <= 0) {
|
if (mbtowc(&wc, c, MB_CUR_MAX) < 0) {
|
||||||
mbtowc(NULL, NULL, 0);
|
mbtowc(NULL, NULL, 0);
|
||||||
wc = (unsigned char)*c;
|
wc = bad_wchar;
|
||||||
}
|
}
|
||||||
|
|
||||||
return iswalnum(wc);
|
return iswalnum(wc);
|
||||||
|
@ -91,11 +98,10 @@ bool is_blank_mbchar(const char *c)
|
||||||
#ifdef ENABLE_UTF8
|
#ifdef ENABLE_UTF8
|
||||||
if (ISSET(USE_UTF8)) {
|
if (ISSET(USE_UTF8)) {
|
||||||
wchar_t wc;
|
wchar_t wc;
|
||||||
int c_mb_len = mbtowc(&wc, c, MB_CUR_MAX);
|
|
||||||
|
|
||||||
if (c_mb_len <= 0) {
|
if (mbtowc(&wc, c, MB_CUR_MAX) < 0) {
|
||||||
mbtowc(NULL, NULL, 0);
|
mbtowc(NULL, NULL, 0);
|
||||||
wc = (unsigned char)*c;
|
wc = bad_wchar;
|
||||||
}
|
}
|
||||||
|
|
||||||
return iswblank(wc);
|
return iswblank(wc);
|
||||||
|
@ -132,9 +138,8 @@ bool is_cntrl_mbchar(const char *c)
|
||||||
#ifdef ENABLE_UTF8
|
#ifdef ENABLE_UTF8
|
||||||
if (ISSET(USE_UTF8)) {
|
if (ISSET(USE_UTF8)) {
|
||||||
wchar_t wc;
|
wchar_t wc;
|
||||||
int c_mb_len = mbtowc(&wc, c, MB_CUR_MAX);
|
|
||||||
|
|
||||||
if (c_mb_len <= 0) {
|
if (mbtowc(&wc, c, MB_CUR_MAX) < 0) {
|
||||||
mbtowc(NULL, NULL, 0);
|
mbtowc(NULL, NULL, 0);
|
||||||
wc = (unsigned char)*c;
|
wc = (unsigned char)*c;
|
||||||
}
|
}
|
||||||
|
@ -155,9 +160,9 @@ bool is_punct_mbchar(const char *c)
|
||||||
wchar_t wc;
|
wchar_t wc;
|
||||||
int c_mb_len = mbtowc(&wc, c, MB_CUR_MAX);
|
int c_mb_len = mbtowc(&wc, c, MB_CUR_MAX);
|
||||||
|
|
||||||
if (c_mb_len <= 0) {
|
if (c_mb_len < 0) {
|
||||||
mbtowc(NULL, NULL, 0);
|
mbtowc(NULL, NULL, 0);
|
||||||
wc = (unsigned char)*c;
|
wc = bad_wchar;
|
||||||
}
|
}
|
||||||
|
|
||||||
return iswpunct(wc);
|
return iswpunct(wc);
|
||||||
|
@ -215,17 +220,18 @@ char *control_mbrep(const char *c, char *crep, int *crep_len)
|
||||||
if (ISSET(USE_UTF8)) {
|
if (ISSET(USE_UTF8)) {
|
||||||
wchar_t wc;
|
wchar_t wc;
|
||||||
|
|
||||||
if (mbtowc(&wc, c, MB_CUR_MAX) <= 0) {
|
if (mbtowc(&wc, c, MB_CUR_MAX) < 0) {
|
||||||
mbtowc(NULL, NULL, 0);
|
mbtowc(NULL, NULL, 0);
|
||||||
wc = (unsigned char)*c;
|
crep = (char *)bad_mbchar;
|
||||||
}
|
*crep_len = bad_mbchar_len;
|
||||||
|
} else {
|
||||||
*crep_len = wctomb(crep, control_wrep(wc));
|
*crep_len = wctomb(crep, control_wrep(wc));
|
||||||
|
|
||||||
if (*crep_len <= 0) {
|
if (*crep_len < 0) {
|
||||||
wctomb(NULL, 0);
|
wctomb(NULL, 0);
|
||||||
*crep_len = 0;
|
*crep_len = 0;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
#endif
|
#endif
|
||||||
*crep_len = 1;
|
*crep_len = 1;
|
||||||
|
@ -245,11 +251,11 @@ int mbwidth(const char *c)
|
||||||
#ifdef ENABLE_UTF8
|
#ifdef ENABLE_UTF8
|
||||||
if (ISSET(USE_UTF8)) {
|
if (ISSET(USE_UTF8)) {
|
||||||
wchar_t wc;
|
wchar_t wc;
|
||||||
int c_mb_len = mbtowc(&wc, c, MB_CUR_MAX), width;
|
int width;
|
||||||
|
|
||||||
if (c_mb_len <= 0) {
|
if (mbtowc(&wc, c, MB_CUR_MAX) < 0) {
|
||||||
mbtowc(NULL, NULL, 0);
|
mbtowc(NULL, NULL, 0);
|
||||||
wc = (unsigned char)*c;
|
wc = bad_wchar;
|
||||||
}
|
}
|
||||||
|
|
||||||
width = wcwidth(wc);
|
width = wcwidth(wc);
|
||||||
|
@ -289,7 +295,7 @@ char *make_mbchar(int chr, int *chr_mb_len)
|
||||||
chr_mb = charalloc(MB_CUR_MAX);
|
chr_mb = charalloc(MB_CUR_MAX);
|
||||||
*chr_mb_len = wctomb(chr_mb, chr);
|
*chr_mb_len = wctomb(chr_mb, chr);
|
||||||
|
|
||||||
if (*chr_mb_len <= 0) {
|
if (*chr_mb_len < 0) {
|
||||||
wctomb(NULL, 0);
|
wctomb(NULL, 0);
|
||||||
*chr_mb_len = 0;
|
*chr_mb_len = 0;
|
||||||
}
|
}
|
||||||
|
@ -324,15 +330,15 @@ int parse_mbchar(const char *buf, char *chr, bool *bad_chr, size_t
|
||||||
/* Get the number of bytes in the multibyte character. */
|
/* Get the number of bytes in the multibyte character. */
|
||||||
buf_mb_len = mblen(buf, MB_CUR_MAX);
|
buf_mb_len = mblen(buf, MB_CUR_MAX);
|
||||||
|
|
||||||
/* If buf contains a null byte or an invalid multibyte
|
/* If buf contains an invalid multibyte character, set bad_chr
|
||||||
* character, set bad_chr to TRUE (if it contains the latter)
|
* to TRUE and interpret buf's first byte. */
|
||||||
* and interpret buf's first byte. */
|
if (buf_mb_len < 0) {
|
||||||
if (buf_mb_len <= 0) {
|
|
||||||
mblen(NULL, 0);
|
mblen(NULL, 0);
|
||||||
if (buf_mb_len < 0 && bad_chr != NULL)
|
if (bad_chr != NULL)
|
||||||
*bad_chr = TRUE;
|
*bad_chr = TRUE;
|
||||||
buf_mb_len = 1;
|
buf_mb_len = 1;
|
||||||
}
|
} else if (buf_mb_len == 0)
|
||||||
|
buf_mb_len++;
|
||||||
|
|
||||||
/* Save the multibyte character in chr. */
|
/* Save the multibyte character in chr. */
|
||||||
if (chr != NULL) {
|
if (chr != NULL) {
|
||||||
|
@ -480,7 +486,7 @@ int mbstrncasecmp(const char *s1, const char *s2, size_t n)
|
||||||
|
|
||||||
s1_mb_len = parse_mbchar(s1, s1_mb, NULL, NULL);
|
s1_mb_len = parse_mbchar(s1, s1_mb, NULL, NULL);
|
||||||
|
|
||||||
if (mbtowc(&ws1, s1_mb, s1_mb_len) <= 0) {
|
if (mbtowc(&ws1, s1_mb, s1_mb_len) < 0) {
|
||||||
mbtowc(NULL, NULL, 0);
|
mbtowc(NULL, NULL, 0);
|
||||||
ws1 = (unsigned char)*s1_mb;
|
ws1 = (unsigned char)*s1_mb;
|
||||||
bad_s1_mb = TRUE;
|
bad_s1_mb = TRUE;
|
||||||
|
@ -488,7 +494,7 @@ int mbstrncasecmp(const char *s1, const char *s2, size_t n)
|
||||||
|
|
||||||
s2_mb_len = parse_mbchar(s2, s2_mb, NULL, NULL);
|
s2_mb_len = parse_mbchar(s2, s2_mb, NULL, NULL);
|
||||||
|
|
||||||
if (mbtowc(&ws2, s2_mb, s2_mb_len) <= 0) {
|
if (mbtowc(&ws2, s2_mb, s2_mb_len) < 0) {
|
||||||
mbtowc(NULL, NULL, 0);
|
mbtowc(NULL, NULL, 0);
|
||||||
ws2 = (unsigned char)*s2_mb;
|
ws2 = (unsigned char)*s2_mb;
|
||||||
bad_s2_mb = TRUE;
|
bad_s2_mb = TRUE;
|
||||||
|
@ -554,7 +560,7 @@ const char *mbstrcasestr(const char *haystack, const char *needle)
|
||||||
|
|
||||||
r_mb_len = parse_mbchar(r, r_mb, NULL, NULL);
|
r_mb_len = parse_mbchar(r, r_mb, NULL, NULL);
|
||||||
|
|
||||||
if (mbtowc(&wr, r_mb, r_mb_len) <= 0) {
|
if (mbtowc(&wr, r_mb, r_mb_len) < 0) {
|
||||||
mbtowc(NULL, NULL, 0);
|
mbtowc(NULL, NULL, 0);
|
||||||
wr = (unsigned char)*r;
|
wr = (unsigned char)*r;
|
||||||
bad_r_mb = TRUE;
|
bad_r_mb = TRUE;
|
||||||
|
@ -562,7 +568,7 @@ const char *mbstrcasestr(const char *haystack, const char *needle)
|
||||||
|
|
||||||
q_mb_len = parse_mbchar(q, q_mb, NULL, NULL);
|
q_mb_len = parse_mbchar(q, q_mb, NULL, NULL);
|
||||||
|
|
||||||
if (mbtowc(&wq, q_mb, q_mb_len) <= 0) {
|
if (mbtowc(&wq, q_mb, q_mb_len) < 0) {
|
||||||
mbtowc(NULL, NULL, 0);
|
mbtowc(NULL, NULL, 0);
|
||||||
wq = (unsigned char)*q;
|
wq = (unsigned char)*q;
|
||||||
bad_q_mb = TRUE;
|
bad_q_mb = TRUE;
|
||||||
|
@ -660,7 +666,7 @@ const char *mbrevstrcasestr(const char *haystack, const char *needle,
|
||||||
|
|
||||||
r_mb_len = parse_mbchar(r, r_mb, NULL, NULL);
|
r_mb_len = parse_mbchar(r, r_mb, NULL, NULL);
|
||||||
|
|
||||||
if (mbtowc(&wr, r_mb, r_mb_len) <= 0) {
|
if (mbtowc(&wr, r_mb, r_mb_len) < 0) {
|
||||||
mbtowc(NULL, NULL, 0);
|
mbtowc(NULL, NULL, 0);
|
||||||
wr = (unsigned char)*r;
|
wr = (unsigned char)*r;
|
||||||
bad_r_mb = TRUE;
|
bad_r_mb = TRUE;
|
||||||
|
@ -668,7 +674,7 @@ const char *mbrevstrcasestr(const char *haystack, const char *needle,
|
||||||
|
|
||||||
q_mb_len = parse_mbchar(q, q_mb, NULL, NULL);
|
q_mb_len = parse_mbchar(q, q_mb, NULL, NULL);
|
||||||
|
|
||||||
if (mbtowc(&wq, q_mb, q_mb_len) <= 0) {
|
if (mbtowc(&wq, q_mb, q_mb_len) < 0) {
|
||||||
mbtowc(NULL, NULL, 0);
|
mbtowc(NULL, NULL, 0);
|
||||||
wq = (unsigned char)*q;
|
wq = (unsigned char)*q;
|
||||||
bad_q_mb = TRUE;
|
bad_q_mb = TRUE;
|
||||||
|
@ -766,7 +772,7 @@ char *mbstrchr(const char *s, char *c)
|
||||||
wchar_t ws, wc;
|
wchar_t ws, wc;
|
||||||
int c_mb_len = mbtowc(&wc, c, MB_CUR_MAX);
|
int c_mb_len = mbtowc(&wc, c, MB_CUR_MAX);
|
||||||
|
|
||||||
if (c_mb_len <= 0) {
|
if (c_mb_len < 0) {
|
||||||
mbtowc(NULL, NULL, 0);
|
mbtowc(NULL, NULL, 0);
|
||||||
wc = (unsigned char)*c;
|
wc = (unsigned char)*c;
|
||||||
bad_c_mb = TRUE;
|
bad_c_mb = TRUE;
|
||||||
|
@ -775,7 +781,7 @@ char *mbstrchr(const char *s, char *c)
|
||||||
while (*s != '\0') {
|
while (*s != '\0') {
|
||||||
int s_mb_len = parse_mbchar(s, s_mb, NULL, NULL);
|
int s_mb_len = parse_mbchar(s, s_mb, NULL, NULL);
|
||||||
|
|
||||||
if (mbtowc(&ws, s_mb, s_mb_len) <= 0) {
|
if (mbtowc(&ws, s_mb, s_mb_len) < 0) {
|
||||||
mbtowc(NULL, NULL, 0);
|
mbtowc(NULL, NULL, 0);
|
||||||
ws = (unsigned char)*s;
|
ws = (unsigned char)*s;
|
||||||
bad_s_mb = TRUE;
|
bad_s_mb = TRUE;
|
||||||
|
|
|
@ -138,6 +138,11 @@ extern bool curses_ended;
|
||||||
|
|
||||||
extern char *homedir;
|
extern char *homedir;
|
||||||
|
|
||||||
|
#ifdef ENABLE_UTF8
|
||||||
|
extern const char *bad_mbchar;
|
||||||
|
extern const int bad_mbchar_len;
|
||||||
|
#endif
|
||||||
|
|
||||||
/* The functions we want available. */
|
/* The functions we want available. */
|
||||||
|
|
||||||
/* Public functions in chars.c. */
|
/* Public functions in chars.c. */
|
||||||
|
|
21
src/winio.c
21
src/winio.c
|
@ -2365,8 +2365,8 @@ char *display_string(const char *buf, size_t start_col, size_t len, bool
|
||||||
start_col++;
|
start_col++;
|
||||||
}
|
}
|
||||||
/* If buf contains a control character, interpret it. If buf
|
/* If buf contains a control character, interpret it. If buf
|
||||||
* contains an invalid multibyte control character, interpret
|
* contains an invalid multibyte control character, display it
|
||||||
* it as though it's a normal control character.*/
|
* as such.*/
|
||||||
} else if (is_cntrl_mbchar(buf_mb)) {
|
} else if (is_cntrl_mbchar(buf_mb)) {
|
||||||
char *ctrl_buf_mb = charalloc(mb_cur_max());
|
char *ctrl_buf_mb = charalloc(mb_cur_max());
|
||||||
int ctrl_buf_mb_len, i;
|
int ctrl_buf_mb_len, i;
|
||||||
|
@ -2402,21 +2402,12 @@ char *display_string(const char *buf, size_t start_col, size_t len, bool
|
||||||
|
|
||||||
#ifdef ENABLE_UTF8
|
#ifdef ENABLE_UTF8
|
||||||
/* If buf contains an invalid multibyte non-control
|
/* If buf contains an invalid multibyte non-control
|
||||||
* character, interpret it as though it's a normal
|
* character, display it as such. */
|
||||||
* non-control character. */
|
|
||||||
if (ISSET(USE_UTF8) && bad_char) {
|
if (ISSET(USE_UTF8) && bad_char) {
|
||||||
char *bad_buf_mb;
|
for (i = 0; i < bad_mbchar_len; i++)
|
||||||
int bad_buf_mb_len;
|
converted[index++] = bad_mbchar[i];
|
||||||
|
|
||||||
bad_buf_mb = make_mbchar((unsigned char)*buf_mb,
|
start_col += mbwidth(bad_mbchar);
|
||||||
&bad_buf_mb_len);
|
|
||||||
|
|
||||||
for (i = 0; i < bad_buf_mb_len; i++)
|
|
||||||
converted[index++] = bad_buf_mb[i];
|
|
||||||
|
|
||||||
start_col += mbwidth(bad_buf_mb);
|
|
||||||
|
|
||||||
free(bad_buf_mb);
|
|
||||||
} else {
|
} else {
|
||||||
#endif
|
#endif
|
||||||
for (i = 0; i < buf_mb_len; i++)
|
for (i = 0; i < buf_mb_len; i++)
|
||||||
|
|
Loading…
Reference in New Issue