From 929770191eb0f71125c2bb9dcee2665e6d947a94 Mon Sep 17 00:00:00 2001 From: Benno Schulenberg Date: Thu, 25 Mar 2021 17:28:51 +0100 Subject: [PATCH] chars: work around a UTF-8 bug in glibc, to display invalid codes right The mblen() and mbtowc() functions will happily return 4 or 5 or 6 for byte sequences that start with 0xF4 0x90 or higher. But those sequences encode for U+110000 or higher, which are not valid Unicode code points. The libc of FreeBSD and OpenBSD and Alpine correctly return -1 for such sequences. Make nano behave correctly also when linked against glibc, so that invalid sequences are always presented as a series of invalid bytes and never as a single invalid code. This fixes https://savannah.gnu.org/bugs/?60262. Bug existed since before version 2.0.0. --- src/chars.c | 56 ++++++++++++++++++++++++++---------------------- src/prototypes.h | 1 + src/winio.c | 12 +++++------ 3 files changed, 37 insertions(+), 32 deletions(-) diff --git a/src/chars.c b/src/chars.c index 90f3f495..67edc889 100644 --- a/src/chars.c +++ b/src/chars.c @@ -51,7 +51,7 @@ bool is_alpha_char(const char *c) #ifdef ENABLE_UTF8 wchar_t wc; - if (mbtowc(&wc, c, MAXCHARLEN) < 0) + if (mbtowide(&wc, c) < 0) return FALSE; return iswalpha(wc); @@ -67,7 +67,7 @@ bool is_alnum_char(const char *c) #ifdef ENABLE_UTF8 wchar_t wc; - if (mbtowc(&wc, c, MAXCHARLEN) < 0) + if (mbtowide(&wc, c) < 0) return FALSE; return iswalnum(wc); @@ -85,7 +85,7 @@ bool is_blank_char(const char *c) if ((signed char)*c >= 0) return (*c == ' ' || *c == '\t'); - if (mbtowc(&wc, c, MAXCHARLEN) < 0) + if (mbtowide(&wc, c) < 0) return FALSE; return iswblank(wc); @@ -112,7 +112,7 @@ bool is_punct_char(const char *c) #ifdef ENABLE_UTF8 wchar_t wc; - if (mbtowc(&wc, c, MAXCHARLEN) < 0) + if (mbtowide(&wc, c) < 0) return FALSE; return iswpunct(wc); @@ -176,6 +176,18 @@ char control_mbrep(const char *c, bool isdata) } #ifdef ENABLE_UTF8 +/* Convert the given multibyte sequence c to wide character wc, and return + * the number of bytes in the sequence, or -1 for an invalid sequence. */ +int mbtowide(wchar_t *wc, const char *c) +{ + int count = mbtowc(wc, c, MAXCHARLEN); + + if (count < 0 || *wc > 0x10FFFF) + return -1; + else + return count; +} + /* Return the width in columns of the given (multibyte) character. */ int mbwidth(const char *c) { @@ -184,7 +196,7 @@ int mbwidth(const char *c) wchar_t wc; int width; - if (mbtowc(&wc, c, MAXCHARLEN) < 0) + if (mbtowide(&wc, c) < 0) return 1; width = wcwidth(wc); @@ -226,9 +238,14 @@ int char_length(const char *pointer) { #ifdef ENABLE_UTF8 /* If possibly a multibyte character, get its length; otherwise, it's 1. */ - if ((unsigned char)*pointer > 0xC1) { + if ((unsigned char)*pointer > 0xC1 && use_utf8) { int length = mblen(pointer, MAXCHARLEN); + /* Codes beyond U+10FFFF are invalid, even when glibc thinks otherwise. */ + if ((unsigned char)*pointer > 0xF4 || ((unsigned char)*pointer == 0xF4 && + (unsigned char)*(pointer + 1) > 0x8F)) + return 1; + return (length < 0 ? 1 : length); } else #endif @@ -243,9 +260,7 @@ size_t mbstrlen(const char *pointer) while (*pointer != '\0') { #ifdef ENABLE_UTF8 if ((unsigned char)*pointer > 0xC1) { - int length = mblen(pointer, MAXCHARLEN); - - pointer += (length < 0 ? 1 : length); + pointer += char_length(pointer); } else #endif pointer++; @@ -265,11 +280,7 @@ int collect_char(const char *string, char *thechar) #ifdef ENABLE_UTF8 /* If this is a UTF-8 starter byte, get the number of bytes of the character. */ if ((unsigned char)*string > 0xC1) { - charlen = mblen(string, MAXCHARLEN); - - /* When the multibyte sequence is invalid, only take the first byte. */ - if (charlen <= 0) - charlen = 1; + charlen = char_length(string); } else #endif charlen = 1; @@ -286,19 +297,12 @@ int advance_over(const char *string, size_t *column) { #ifdef ENABLE_UTF8 if ((signed char)*string < 0) { - int charlen = mblen(string, MAXCHARLEN); - - if (charlen > 0) { if (is_cntrl_char(string)) *column += 2; else *column += mbwidth(string); - } else { - charlen = 1; - *column += 1; - } - return charlen; + return char_length(string); } #endif @@ -395,8 +399,8 @@ int mbstrncasecmp(const char *s1, const char *s2, size_t n) continue; } - bool bad1 = (mbtowc(&wc1, s1, MAXCHARLEN) < 0); - bool bad2 = (mbtowc(&wc2, s2, MAXCHARLEN) < 0); + bool bad1 = (mbtowide(&wc1, s1) < 0); + bool bad2 = (mbtowide(&wc2, s2) < 0); if (bad1 || bad2) { if (*s1 != *s2) @@ -521,13 +525,13 @@ char *mbstrchr(const char *string, const char *chr) bool bad_s = FALSE, bad_c = FALSE; wchar_t ws, wc; - if (mbtowc(&wc, chr, MAXCHARLEN) < 0) { + if (mbtowide(&wc, chr) < 0) { wc = (unsigned char)*chr; bad_c = TRUE; } while (*string != '\0') { - int symlen = mbtowc(&ws, string, MAXCHARLEN); + int symlen = mbtowide(&ws, string); if (symlen < 0) { ws = (unsigned char)*string; diff --git a/src/prototypes.h b/src/prototypes.h index 5180d27d..f7bdc993 100644 --- a/src/prototypes.h +++ b/src/prototypes.h @@ -204,6 +204,7 @@ bool is_cntrl_char(const char *c); bool is_word_char(const char *c, bool allow_punct); char control_mbrep(const char *c, bool isdata); #ifdef ENABLE_UTF8 +int mbtowide(wchar_t *wc, const char *c); int mbwidth(const char *c); bool is_zerowidth(const char *ch); char *make_mbchar(long code, int *length); diff --git a/src/winio.c b/src/winio.c index 59a3c775..0afc5133 100644 --- a/src/winio.c +++ b/src/winio.c @@ -1820,14 +1820,14 @@ char *display_string(const char *buf, size_t column, size_t span, wchar_t wc; /* Convert a multibyte character to a single code. */ - charlength = mbtowc(&wc, buf, MAXCHARLEN); + charlength = mbtowide(&wc, buf); /* Represent an invalid character with the Replacement Character. */ - if (charlength < 0 || wc > 0x10FFFF) { + if (charlength < 0) { converted[index++] = '\xEF'; converted[index++] = '\xBF'; converted[index++] = '\xBD'; - buf += (charlength > 0 ? charlength : 1); + buf++; column++; continue; } @@ -2151,7 +2151,7 @@ void minibar(void) #ifdef ENABLE_UTF8 else if ((unsigned char)*this_position < 0x80 && using_utf8()) sprintf(hexadecimal, "U+%04X", (unsigned char)*this_position); - else if (using_utf8() && mbtowc(&widecode, this_position, MAXCHARLEN) >= 0) + else if (using_utf8() && mbtowide(&widecode, this_position) > 0) sprintf(hexadecimal, "U+%04X", (int)widecode); #endif else @@ -2163,13 +2163,13 @@ void minibar(void) successor = this_position + char_length(this_position); if (*this_position && *successor && is_zerowidth(successor) && - mbtowc(&widecode, successor, MAXCHARLEN) >= 0) { + mbtowide(&widecode, successor) > 0) { sprintf(hexadecimal, "|%04X", (int)widecode); waddstr(bottomwin, hexadecimal); successor += char_length(successor); - if (is_zerowidth(successor) && mbtowc(&widecode, successor, MAXCHARLEN) >= 0) { + if (is_zerowidth(successor) && mbtowide(&widecode, successor) > 0) { sprintf(hexadecimal, "|%04X", (int)widecode); waddstr(bottomwin, hexadecimal); }