chars: work around a UTF-8 bug in glibc, to display invalid codes right
The mblen() and mbtowc() functions will happily return 4 or 5 or 6 for byte sequences that start with 0xF4 0x90 or higher. But those sequences encode U+110000 or higher, which are not valid Unicode code points. The libc implementations of FreeBSD, OpenBSD, and Alpine correctly return -1 for such sequences. Make nano behave correctly when linked against glibc too, so that invalid sequences are always presented as a series of invalid bytes and never as a single invalid code. This fixes https://savannah.gnu.org/bugs/?60262. The bug has existed since before version 2.0.0.
master
parent
66d9d6c6d2
commit
929770191e
56
src/chars.c
56
src/chars.c
|
@ -51,7 +51,7 @@ bool is_alpha_char(const char *c)
|
||||||
#ifdef ENABLE_UTF8
|
#ifdef ENABLE_UTF8
|
||||||
wchar_t wc;
|
wchar_t wc;
|
||||||
|
|
||||||
if (mbtowc(&wc, c, MAXCHARLEN) < 0)
|
if (mbtowide(&wc, c) < 0)
|
||||||
return FALSE;
|
return FALSE;
|
||||||
|
|
||||||
return iswalpha(wc);
|
return iswalpha(wc);
|
||||||
|
@ -67,7 +67,7 @@ bool is_alnum_char(const char *c)
|
||||||
#ifdef ENABLE_UTF8
|
#ifdef ENABLE_UTF8
|
||||||
wchar_t wc;
|
wchar_t wc;
|
||||||
|
|
||||||
if (mbtowc(&wc, c, MAXCHARLEN) < 0)
|
if (mbtowide(&wc, c) < 0)
|
||||||
return FALSE;
|
return FALSE;
|
||||||
|
|
||||||
return iswalnum(wc);
|
return iswalnum(wc);
|
||||||
|
@ -85,7 +85,7 @@ bool is_blank_char(const char *c)
|
||||||
if ((signed char)*c >= 0)
|
if ((signed char)*c >= 0)
|
||||||
return (*c == ' ' || *c == '\t');
|
return (*c == ' ' || *c == '\t');
|
||||||
|
|
||||||
if (mbtowc(&wc, c, MAXCHARLEN) < 0)
|
if (mbtowide(&wc, c) < 0)
|
||||||
return FALSE;
|
return FALSE;
|
||||||
|
|
||||||
return iswblank(wc);
|
return iswblank(wc);
|
||||||
|
@ -112,7 +112,7 @@ bool is_punct_char(const char *c)
|
||||||
#ifdef ENABLE_UTF8
|
#ifdef ENABLE_UTF8
|
||||||
wchar_t wc;
|
wchar_t wc;
|
||||||
|
|
||||||
if (mbtowc(&wc, c, MAXCHARLEN) < 0)
|
if (mbtowide(&wc, c) < 0)
|
||||||
return FALSE;
|
return FALSE;
|
||||||
|
|
||||||
return iswpunct(wc);
|
return iswpunct(wc);
|
||||||
|
@ -176,6 +176,18 @@ char control_mbrep(const char *c, bool isdata)
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef ENABLE_UTF8
|
#ifdef ENABLE_UTF8
|
||||||
|
/* Convert the given multibyte sequence c to wide character wc, and return
|
||||||
|
* the number of bytes in the sequence, or -1 for an invalid sequence. */
|
||||||
|
int mbtowide(wchar_t *wc, const char *c)
|
||||||
|
{
|
||||||
|
int count = mbtowc(wc, c, MAXCHARLEN);
|
||||||
|
|
||||||
|
if (count < 0 || *wc > 0x10FFFF)
|
||||||
|
return -1;
|
||||||
|
else
|
||||||
|
return count;
|
||||||
|
}
|
||||||
|
|
||||||
/* Return the width in columns of the given (multibyte) character. */
|
/* Return the width in columns of the given (multibyte) character. */
|
||||||
int mbwidth(const char *c)
|
int mbwidth(const char *c)
|
||||||
{
|
{
|
||||||
|
@ -184,7 +196,7 @@ int mbwidth(const char *c)
|
||||||
wchar_t wc;
|
wchar_t wc;
|
||||||
int width;
|
int width;
|
||||||
|
|
||||||
if (mbtowc(&wc, c, MAXCHARLEN) < 0)
|
if (mbtowide(&wc, c) < 0)
|
||||||
return 1;
|
return 1;
|
||||||
|
|
||||||
width = wcwidth(wc);
|
width = wcwidth(wc);
|
||||||
|
@ -226,9 +238,14 @@ int char_length(const char *pointer)
|
||||||
{
|
{
|
||||||
#ifdef ENABLE_UTF8
|
#ifdef ENABLE_UTF8
|
||||||
/* If possibly a multibyte character, get its length; otherwise, it's 1. */
|
/* If possibly a multibyte character, get its length; otherwise, it's 1. */
|
||||||
if ((unsigned char)*pointer > 0xC1) {
|
if ((unsigned char)*pointer > 0xC1 && use_utf8) {
|
||||||
int length = mblen(pointer, MAXCHARLEN);
|
int length = mblen(pointer, MAXCHARLEN);
|
||||||
|
|
||||||
|
/* Codes beyond U+10FFFF are invalid, even when glibc thinks otherwise. */
|
||||||
|
if ((unsigned char)*pointer > 0xF4 || ((unsigned char)*pointer == 0xF4 &&
|
||||||
|
(unsigned char)*(pointer + 1) > 0x8F))
|
||||||
|
return 1;
|
||||||
|
|
||||||
return (length < 0 ? 1 : length);
|
return (length < 0 ? 1 : length);
|
||||||
} else
|
} else
|
||||||
#endif
|
#endif
|
||||||
|
@ -243,9 +260,7 @@ size_t mbstrlen(const char *pointer)
|
||||||
while (*pointer != '\0') {
|
while (*pointer != '\0') {
|
||||||
#ifdef ENABLE_UTF8
|
#ifdef ENABLE_UTF8
|
||||||
if ((unsigned char)*pointer > 0xC1) {
|
if ((unsigned char)*pointer > 0xC1) {
|
||||||
int length = mblen(pointer, MAXCHARLEN);
|
pointer += char_length(pointer);
|
||||||
|
|
||||||
pointer += (length < 0 ? 1 : length);
|
|
||||||
} else
|
} else
|
||||||
#endif
|
#endif
|
||||||
pointer++;
|
pointer++;
|
||||||
|
@ -265,11 +280,7 @@ int collect_char(const char *string, char *thechar)
|
||||||
#ifdef ENABLE_UTF8
|
#ifdef ENABLE_UTF8
|
||||||
/* If this is a UTF-8 starter byte, get the number of bytes of the character. */
|
/* If this is a UTF-8 starter byte, get the number of bytes of the character. */
|
||||||
if ((unsigned char)*string > 0xC1) {
|
if ((unsigned char)*string > 0xC1) {
|
||||||
charlen = mblen(string, MAXCHARLEN);
|
charlen = char_length(string);
|
||||||
|
|
||||||
/* When the multibyte sequence is invalid, only take the first byte. */
|
|
||||||
if (charlen <= 0)
|
|
||||||
charlen = 1;
|
|
||||||
} else
|
} else
|
||||||
#endif
|
#endif
|
||||||
charlen = 1;
|
charlen = 1;
|
||||||
|
@ -286,19 +297,12 @@ int advance_over(const char *string, size_t *column)
|
||||||
{
|
{
|
||||||
#ifdef ENABLE_UTF8
|
#ifdef ENABLE_UTF8
|
||||||
if ((signed char)*string < 0) {
|
if ((signed char)*string < 0) {
|
||||||
int charlen = mblen(string, MAXCHARLEN);
|
|
||||||
|
|
||||||
if (charlen > 0) {
|
|
||||||
if (is_cntrl_char(string))
|
if (is_cntrl_char(string))
|
||||||
*column += 2;
|
*column += 2;
|
||||||
else
|
else
|
||||||
*column += mbwidth(string);
|
*column += mbwidth(string);
|
||||||
} else {
|
|
||||||
charlen = 1;
|
|
||||||
*column += 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
return charlen;
|
return char_length(string);
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
@ -395,8 +399,8 @@ int mbstrncasecmp(const char *s1, const char *s2, size_t n)
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool bad1 = (mbtowc(&wc1, s1, MAXCHARLEN) < 0);
|
bool bad1 = (mbtowide(&wc1, s1) < 0);
|
||||||
bool bad2 = (mbtowc(&wc2, s2, MAXCHARLEN) < 0);
|
bool bad2 = (mbtowide(&wc2, s2) < 0);
|
||||||
|
|
||||||
if (bad1 || bad2) {
|
if (bad1 || bad2) {
|
||||||
if (*s1 != *s2)
|
if (*s1 != *s2)
|
||||||
|
@ -521,13 +525,13 @@ char *mbstrchr(const char *string, const char *chr)
|
||||||
bool bad_s = FALSE, bad_c = FALSE;
|
bool bad_s = FALSE, bad_c = FALSE;
|
||||||
wchar_t ws, wc;
|
wchar_t ws, wc;
|
||||||
|
|
||||||
if (mbtowc(&wc, chr, MAXCHARLEN) < 0) {
|
if (mbtowide(&wc, chr) < 0) {
|
||||||
wc = (unsigned char)*chr;
|
wc = (unsigned char)*chr;
|
||||||
bad_c = TRUE;
|
bad_c = TRUE;
|
||||||
}
|
}
|
||||||
|
|
||||||
while (*string != '\0') {
|
while (*string != '\0') {
|
||||||
int symlen = mbtowc(&ws, string, MAXCHARLEN);
|
int symlen = mbtowide(&ws, string);
|
||||||
|
|
||||||
if (symlen < 0) {
|
if (symlen < 0) {
|
||||||
ws = (unsigned char)*string;
|
ws = (unsigned char)*string;
|
||||||
|
|
|
@ -204,6 +204,7 @@ bool is_cntrl_char(const char *c);
|
||||||
bool is_word_char(const char *c, bool allow_punct);
|
bool is_word_char(const char *c, bool allow_punct);
|
||||||
char control_mbrep(const char *c, bool isdata);
|
char control_mbrep(const char *c, bool isdata);
|
||||||
#ifdef ENABLE_UTF8
|
#ifdef ENABLE_UTF8
|
||||||
|
int mbtowide(wchar_t *wc, const char *c);
|
||||||
int mbwidth(const char *c);
|
int mbwidth(const char *c);
|
||||||
bool is_zerowidth(const char *ch);
|
bool is_zerowidth(const char *ch);
|
||||||
char *make_mbchar(long code, int *length);
|
char *make_mbchar(long code, int *length);
|
||||||
|
|
12
src/winio.c
12
src/winio.c
|
@ -1820,14 +1820,14 @@ char *display_string(const char *buf, size_t column, size_t span,
|
||||||
wchar_t wc;
|
wchar_t wc;
|
||||||
|
|
||||||
/* Convert a multibyte character to a single code. */
|
/* Convert a multibyte character to a single code. */
|
||||||
charlength = mbtowc(&wc, buf, MAXCHARLEN);
|
charlength = mbtowide(&wc, buf);
|
||||||
|
|
||||||
/* Represent an invalid character with the Replacement Character. */
|
/* Represent an invalid character with the Replacement Character. */
|
||||||
if (charlength < 0 || wc > 0x10FFFF) {
|
if (charlength < 0) {
|
||||||
converted[index++] = '\xEF';
|
converted[index++] = '\xEF';
|
||||||
converted[index++] = '\xBF';
|
converted[index++] = '\xBF';
|
||||||
converted[index++] = '\xBD';
|
converted[index++] = '\xBD';
|
||||||
buf += (charlength > 0 ? charlength : 1);
|
buf++;
|
||||||
column++;
|
column++;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
@ -2151,7 +2151,7 @@ void minibar(void)
|
||||||
#ifdef ENABLE_UTF8
|
#ifdef ENABLE_UTF8
|
||||||
else if ((unsigned char)*this_position < 0x80 && using_utf8())
|
else if ((unsigned char)*this_position < 0x80 && using_utf8())
|
||||||
sprintf(hexadecimal, "U+%04X", (unsigned char)*this_position);
|
sprintf(hexadecimal, "U+%04X", (unsigned char)*this_position);
|
||||||
else if (using_utf8() && mbtowc(&widecode, this_position, MAXCHARLEN) >= 0)
|
else if (using_utf8() && mbtowide(&widecode, this_position) > 0)
|
||||||
sprintf(hexadecimal, "U+%04X", (int)widecode);
|
sprintf(hexadecimal, "U+%04X", (int)widecode);
|
||||||
#endif
|
#endif
|
||||||
else
|
else
|
||||||
|
@ -2163,13 +2163,13 @@ void minibar(void)
|
||||||
successor = this_position + char_length(this_position);
|
successor = this_position + char_length(this_position);
|
||||||
|
|
||||||
if (*this_position && *successor && is_zerowidth(successor) &&
|
if (*this_position && *successor && is_zerowidth(successor) &&
|
||||||
mbtowc(&widecode, successor, MAXCHARLEN) >= 0) {
|
mbtowide(&widecode, successor) > 0) {
|
||||||
sprintf(hexadecimal, "|%04X", (int)widecode);
|
sprintf(hexadecimal, "|%04X", (int)widecode);
|
||||||
waddstr(bottomwin, hexadecimal);
|
waddstr(bottomwin, hexadecimal);
|
||||||
|
|
||||||
successor += char_length(successor);
|
successor += char_length(successor);
|
||||||
|
|
||||||
if (is_zerowidth(successor) && mbtowc(&widecode, successor, MAXCHARLEN) >= 0) {
|
if (is_zerowidth(successor) && mbtowide(&widecode, successor) > 0) {
|
||||||
sprintf(hexadecimal, "|%04X", (int)widecode);
|
sprintf(hexadecimal, "|%04X", (int)widecode);
|
||||||
waddstr(bottomwin, hexadecimal);
|
waddstr(bottomwin, hexadecimal);
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue