display invalid multibyte sequences as Unicode FFFD, take 2; also clean
up the character-parsing functions

git-svn-id: svn://svn.savannah.gnu.org/nano/trunk/nano@2909 35c25a1d-7b9e-4130-9fde-d3aeb78583b8

master
parent
66444c37b3
commit
61f567378a
13
ChangeLog
13
ChangeLog
|
@ -78,6 +78,16 @@ CVS code -
|
|||
out of the prompt, and that magichistory is properly updated
|
||||
when we change it and then move up. New function
|
||||
history_reset(); changes to nanogetstr(). (DLR)
|
||||
- Various character-handling cleanups. If we get an invalid
|
||||
multibyte sequence, treat it as Unicode FFFD (Replacement
|
||||
Character), unless we're determining if it's a control
|
||||
character or searching for a match to it. Also, remove
|
||||
unneeded variables and checks when parsing multibyte
|
||||
sequences. Changes to is_alnum_mbchar(), is_blank_mbchar(),
|
||||
is_cntrl_mbchar(), is_punct_mbchar(), control_mbrep(),
|
||||
mbwidth(), make_mbchar(), parse_mbchar(), mbstrncasecmp(),
|
||||
mbstrcasestr(), mbrevstrcasestr(), mbstrchr(), and
|
||||
display_string(). (DLR)
|
||||
- chars.c:
|
||||
mbstrchr()
|
||||
- Don't count matches between valid and invalid multibyte
|
||||
|
@ -147,6 +157,9 @@ CVS code -
|
|||
HAVE_SNPRINTF. (DLR)
|
||||
- Remove TOP from the topmidnone enum, and rename it centernone.
|
||||
(DLR)
|
||||
proto.h:
|
||||
- Add declarations for bad_mbchar and bad_mbchar_len, so that we
|
||||
can use them in display_string() as well as chars.c. (DLR)
|
||||
- rcfile.c:
|
||||
nregcomp()
|
||||
- Return TRUE when the compilation succeeds and FALSE otherwise,
|
||||
|
|
72
src/chars.c
72
src/chars.c
|
@ -37,6 +37,14 @@
|
|||
#ifdef HAVE_WCTYPE_H
|
||||
#include <wctype.h>
|
||||
#endif
|
||||
|
||||
static const wchar_t bad_wchar = 0xFFFD;
|
||||
/* If we get an invalid multibyte sequence, we treat it as
|
||||
* Unicode FFFD (Replacement Character), unless we're
|
||||
* determining if it's a control character or searching for a
|
||||
* match to it. */
|
||||
const char *bad_mbchar = "\xEF\xBF\xBD";
|
||||
const int bad_mbchar_len = 3;
|
||||
#endif
|
||||
|
||||
#ifndef HAVE_ISBLANK
|
||||
|
@ -70,11 +78,10 @@ bool is_alnum_mbchar(const char *c)
|
|||
#ifdef ENABLE_UTF8
|
||||
if (ISSET(USE_UTF8)) {
|
||||
wchar_t wc;
|
||||
int c_mb_len = mbtowc(&wc, c, MB_CUR_MAX);
|
||||
|
||||
if (c_mb_len <= 0) {
|
||||
if (mbtowc(&wc, c, MB_CUR_MAX) < 0) {
|
||||
mbtowc(NULL, NULL, 0);
|
||||
wc = (unsigned char)*c;
|
||||
wc = bad_wchar;
|
||||
}
|
||||
|
||||
return iswalnum(wc);
|
||||
|
@ -91,11 +98,10 @@ bool is_blank_mbchar(const char *c)
|
|||
#ifdef ENABLE_UTF8
|
||||
if (ISSET(USE_UTF8)) {
|
||||
wchar_t wc;
|
||||
int c_mb_len = mbtowc(&wc, c, MB_CUR_MAX);
|
||||
|
||||
if (c_mb_len <= 0) {
|
||||
if (mbtowc(&wc, c, MB_CUR_MAX) < 0) {
|
||||
mbtowc(NULL, NULL, 0);
|
||||
wc = (unsigned char)*c;
|
||||
wc = bad_wchar;
|
||||
}
|
||||
|
||||
return iswblank(wc);
|
||||
|
@ -132,9 +138,8 @@ bool is_cntrl_mbchar(const char *c)
|
|||
#ifdef ENABLE_UTF8
|
||||
if (ISSET(USE_UTF8)) {
|
||||
wchar_t wc;
|
||||
int c_mb_len = mbtowc(&wc, c, MB_CUR_MAX);
|
||||
|
||||
if (c_mb_len <= 0) {
|
||||
if (mbtowc(&wc, c, MB_CUR_MAX) < 0) {
|
||||
mbtowc(NULL, NULL, 0);
|
||||
wc = (unsigned char)*c;
|
||||
}
|
||||
|
@ -155,9 +160,9 @@ bool is_punct_mbchar(const char *c)
|
|||
wchar_t wc;
|
||||
int c_mb_len = mbtowc(&wc, c, MB_CUR_MAX);
|
||||
|
||||
if (c_mb_len <= 0) {
|
||||
if (c_mb_len < 0) {
|
||||
mbtowc(NULL, NULL, 0);
|
||||
wc = (unsigned char)*c;
|
||||
wc = bad_wchar;
|
||||
}
|
||||
|
||||
return iswpunct(wc);
|
||||
|
@ -215,17 +220,18 @@ char *control_mbrep(const char *c, char *crep, int *crep_len)
|
|||
if (ISSET(USE_UTF8)) {
|
||||
wchar_t wc;
|
||||
|
||||
if (mbtowc(&wc, c, MB_CUR_MAX) <= 0) {
|
||||
if (mbtowc(&wc, c, MB_CUR_MAX) < 0) {
|
||||
mbtowc(NULL, NULL, 0);
|
||||
wc = (unsigned char)*c;
|
||||
}
|
||||
|
||||
crep = (char *)bad_mbchar;
|
||||
*crep_len = bad_mbchar_len;
|
||||
} else {
|
||||
*crep_len = wctomb(crep, control_wrep(wc));
|
||||
|
||||
if (*crep_len <= 0) {
|
||||
if (*crep_len < 0) {
|
||||
wctomb(NULL, 0);
|
||||
*crep_len = 0;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
#endif
|
||||
*crep_len = 1;
|
||||
|
@ -245,11 +251,11 @@ int mbwidth(const char *c)
|
|||
#ifdef ENABLE_UTF8
|
||||
if (ISSET(USE_UTF8)) {
|
||||
wchar_t wc;
|
||||
int c_mb_len = mbtowc(&wc, c, MB_CUR_MAX), width;
|
||||
int width;
|
||||
|
||||
if (c_mb_len <= 0) {
|
||||
if (mbtowc(&wc, c, MB_CUR_MAX) < 0) {
|
||||
mbtowc(NULL, NULL, 0);
|
||||
wc = (unsigned char)*c;
|
||||
wc = bad_wchar;
|
||||
}
|
||||
|
||||
width = wcwidth(wc);
|
||||
|
@ -289,7 +295,7 @@ char *make_mbchar(int chr, int *chr_mb_len)
|
|||
chr_mb = charalloc(MB_CUR_MAX);
|
||||
*chr_mb_len = wctomb(chr_mb, chr);
|
||||
|
||||
if (*chr_mb_len <= 0) {
|
||||
if (*chr_mb_len < 0) {
|
||||
wctomb(NULL, 0);
|
||||
*chr_mb_len = 0;
|
||||
}
|
||||
|
@ -324,15 +330,15 @@ int parse_mbchar(const char *buf, char *chr, bool *bad_chr, size_t
|
|||
/* Get the number of bytes in the multibyte character. */
|
||||
buf_mb_len = mblen(buf, MB_CUR_MAX);
|
||||
|
||||
/* If buf contains a null byte or an invalid multibyte
|
||||
* character, set bad_chr to TRUE (if it contains the latter)
|
||||
* and interpret buf's first byte. */
|
||||
if (buf_mb_len <= 0) {
|
||||
/* If buf contains an invalid multibyte character, set bad_chr
|
||||
* to TRUE and interpret buf's first byte. */
|
||||
if (buf_mb_len < 0) {
|
||||
mblen(NULL, 0);
|
||||
if (buf_mb_len < 0 && bad_chr != NULL)
|
||||
if (bad_chr != NULL)
|
||||
*bad_chr = TRUE;
|
||||
buf_mb_len = 1;
|
||||
}
|
||||
} else if (buf_mb_len == 0)
|
||||
buf_mb_len++;
|
||||
|
||||
/* Save the multibyte character in chr. */
|
||||
if (chr != NULL) {
|
||||
|
@ -480,7 +486,7 @@ int mbstrncasecmp(const char *s1, const char *s2, size_t n)
|
|||
|
||||
s1_mb_len = parse_mbchar(s1, s1_mb, NULL, NULL);
|
||||
|
||||
if (mbtowc(&ws1, s1_mb, s1_mb_len) <= 0) {
|
||||
if (mbtowc(&ws1, s1_mb, s1_mb_len) < 0) {
|
||||
mbtowc(NULL, NULL, 0);
|
||||
ws1 = (unsigned char)*s1_mb;
|
||||
bad_s1_mb = TRUE;
|
||||
|
@ -488,7 +494,7 @@ int mbstrncasecmp(const char *s1, const char *s2, size_t n)
|
|||
|
||||
s2_mb_len = parse_mbchar(s2, s2_mb, NULL, NULL);
|
||||
|
||||
if (mbtowc(&ws2, s2_mb, s2_mb_len) <= 0) {
|
||||
if (mbtowc(&ws2, s2_mb, s2_mb_len) < 0) {
|
||||
mbtowc(NULL, NULL, 0);
|
||||
ws2 = (unsigned char)*s2_mb;
|
||||
bad_s2_mb = TRUE;
|
||||
|
@ -554,7 +560,7 @@ const char *mbstrcasestr(const char *haystack, const char *needle)
|
|||
|
||||
r_mb_len = parse_mbchar(r, r_mb, NULL, NULL);
|
||||
|
||||
if (mbtowc(&wr, r_mb, r_mb_len) <= 0) {
|
||||
if (mbtowc(&wr, r_mb, r_mb_len) < 0) {
|
||||
mbtowc(NULL, NULL, 0);
|
||||
wr = (unsigned char)*r;
|
||||
bad_r_mb = TRUE;
|
||||
|
@ -562,7 +568,7 @@ const char *mbstrcasestr(const char *haystack, const char *needle)
|
|||
|
||||
q_mb_len = parse_mbchar(q, q_mb, NULL, NULL);
|
||||
|
||||
if (mbtowc(&wq, q_mb, q_mb_len) <= 0) {
|
||||
if (mbtowc(&wq, q_mb, q_mb_len) < 0) {
|
||||
mbtowc(NULL, NULL, 0);
|
||||
wq = (unsigned char)*q;
|
||||
bad_q_mb = TRUE;
|
||||
|
@ -660,7 +666,7 @@ const char *mbrevstrcasestr(const char *haystack, const char *needle,
|
|||
|
||||
r_mb_len = parse_mbchar(r, r_mb, NULL, NULL);
|
||||
|
||||
if (mbtowc(&wr, r_mb, r_mb_len) <= 0) {
|
||||
if (mbtowc(&wr, r_mb, r_mb_len) < 0) {
|
||||
mbtowc(NULL, NULL, 0);
|
||||
wr = (unsigned char)*r;
|
||||
bad_r_mb = TRUE;
|
||||
|
@ -668,7 +674,7 @@ const char *mbrevstrcasestr(const char *haystack, const char *needle,
|
|||
|
||||
q_mb_len = parse_mbchar(q, q_mb, NULL, NULL);
|
||||
|
||||
if (mbtowc(&wq, q_mb, q_mb_len) <= 0) {
|
||||
if (mbtowc(&wq, q_mb, q_mb_len) < 0) {
|
||||
mbtowc(NULL, NULL, 0);
|
||||
wq = (unsigned char)*q;
|
||||
bad_q_mb = TRUE;
|
||||
|
@ -766,7 +772,7 @@ char *mbstrchr(const char *s, char *c)
|
|||
wchar_t ws, wc;
|
||||
int c_mb_len = mbtowc(&wc, c, MB_CUR_MAX);
|
||||
|
||||
if (c_mb_len <= 0) {
|
||||
if (c_mb_len < 0) {
|
||||
mbtowc(NULL, NULL, 0);
|
||||
wc = (unsigned char)*c;
|
||||
bad_c_mb = TRUE;
|
||||
|
@ -775,7 +781,7 @@ char *mbstrchr(const char *s, char *c)
|
|||
while (*s != '\0') {
|
||||
int s_mb_len = parse_mbchar(s, s_mb, NULL, NULL);
|
||||
|
||||
if (mbtowc(&ws, s_mb, s_mb_len) <= 0) {
|
||||
if (mbtowc(&ws, s_mb, s_mb_len) < 0) {
|
||||
mbtowc(NULL, NULL, 0);
|
||||
ws = (unsigned char)*s;
|
||||
bad_s_mb = TRUE;
|
||||
|
|
|
@ -138,6 +138,11 @@ extern bool curses_ended;
|
|||
|
||||
extern char *homedir;
|
||||
|
||||
#ifdef ENABLE_UTF8
|
||||
extern const char *bad_mbchar;
|
||||
extern const int bad_mbchar_len;
|
||||
#endif
|
||||
|
||||
/* The functions we want available. */
|
||||
|
||||
/* Public functions in chars.c. */
|
||||
|
|
21
src/winio.c
21
src/winio.c
|
@ -2365,8 +2365,8 @@ char *display_string(const char *buf, size_t start_col, size_t len, bool
|
|||
start_col++;
|
||||
}
|
||||
/* If buf contains a control character, interpret it. If buf
|
||||
* contains an invalid multibyte control character, interpret
|
||||
* it as though it's a normal control character.*/
|
||||
* contains an invalid multibyte control character, display it
|
||||
* as such.*/
|
||||
} else if (is_cntrl_mbchar(buf_mb)) {
|
||||
char *ctrl_buf_mb = charalloc(mb_cur_max());
|
||||
int ctrl_buf_mb_len, i;
|
||||
|
@ -2402,21 +2402,12 @@ char *display_string(const char *buf, size_t start_col, size_t len, bool
|
|||
|
||||
#ifdef ENABLE_UTF8
|
||||
/* If buf contains an invalid multibyte non-control
|
||||
* character, interpret it as though it's a normal
|
||||
* non-control character. */
|
||||
* character, display it as such. */
|
||||
if (ISSET(USE_UTF8) && bad_char) {
|
||||
char *bad_buf_mb;
|
||||
int bad_buf_mb_len;
|
||||
for (i = 0; i < bad_mbchar_len; i++)
|
||||
converted[index++] = bad_mbchar[i];
|
||||
|
||||
bad_buf_mb = make_mbchar((unsigned char)*buf_mb,
|
||||
&bad_buf_mb_len);
|
||||
|
||||
for (i = 0; i < bad_buf_mb_len; i++)
|
||||
converted[index++] = bad_buf_mb[i];
|
||||
|
||||
start_col += mbwidth(bad_buf_mb);
|
||||
|
||||
free(bad_buf_mb);
|
||||
start_col += mbwidth(bad_mbchar);
|
||||
} else {
|
||||
#endif
|
||||
for (i = 0; i < buf_mb_len; i++)
|
||||
|
|
Loading…
Reference in New Issue