chars: speed up the handling of invalid UTF-8 starter bytes
The first byte of a multi-byte UTF-8 sequence must be in the range 0xC2...0xFF. Any other byte cannot be a starter byte and can thus immediately be treated as a single byte.
master
parent
8c406bc875
commit
5129e718d7
14
src/chars.c
14
src/chars.c
|
@ -180,7 +180,7 @@ char control_mbrep(const char *c, bool isdata)
|
||||||
int mbwidth(const char *c)
|
int mbwidth(const char *c)
|
||||||
{
|
{
|
||||||
/* Ask for the width only when the character isn't plain ASCII. */
|
/* Ask for the width only when the character isn't plain ASCII. */
|
||||||
if ((signed char)*c <= 0) {
|
if ((unsigned char)*c > 0xC1) {
|
||||||
wchar_t wc;
|
wchar_t wc;
|
||||||
int width;
|
int width;
|
||||||
|
|
||||||
|
@ -227,7 +227,7 @@ int char_length(const char *pointer)
|
||||||
{
|
{
|
||||||
#ifdef ENABLE_UTF8
|
#ifdef ENABLE_UTF8
|
||||||
/* If possibly a multibyte character, get its length; otherwise, it's 1. */
|
/* If possibly a multibyte character, get its length; otherwise, it's 1. */
|
||||||
if ((signed char)*pointer < 0) {
|
if ((unsigned char)*pointer > 0xC1) {
|
||||||
int length = mblen(pointer, MAXCHARLEN);
|
int length = mblen(pointer, MAXCHARLEN);
|
||||||
|
|
||||||
return (length < 0 ? 1 : length);
|
return (length < 0 ? 1 : length);
|
||||||
|
@ -243,7 +243,7 @@ size_t mbstrlen(const char *pointer)
|
||||||
|
|
||||||
while (*pointer != '\0') {
|
while (*pointer != '\0') {
|
||||||
#ifdef ENABLE_UTF8
|
#ifdef ENABLE_UTF8
|
||||||
if ((signed char)*pointer < 0) {
|
if ((unsigned char)*pointer > 0xC1) {
|
||||||
int length = mblen(pointer, MAXCHARLEN);
|
int length = mblen(pointer, MAXCHARLEN);
|
||||||
|
|
||||||
pointer += (length < 0 ? 1 : length);
|
pointer += (length < 0 ? 1 : length);
|
||||||
|
@ -265,7 +265,7 @@ int collect_char(const char *string, char *thechar)
|
||||||
|
|
||||||
#ifdef ENABLE_UTF8
|
#ifdef ENABLE_UTF8
|
||||||
/* If this is a UTF-8 starter byte, get the number of bytes of the character. */
|
/* If this is a UTF-8 starter byte, get the number of bytes of the character. */
|
||||||
if ((signed char)*string < 0) {
|
if ((unsigned char)*string > 0xC1) {
|
||||||
charlen = mblen(string, MAXCHARLEN);
|
charlen = mblen(string, MAXCHARLEN);
|
||||||
|
|
||||||
/* When the multibyte sequence is invalid, only take the first byte. */
|
/* When the multibyte sequence is invalid, only take the first byte. */
|
||||||
|
@ -286,7 +286,7 @@ int collect_char(const char *string, char *thechar)
|
||||||
int advance_over(const char *string, size_t *column)
|
int advance_over(const char *string, size_t *column)
|
||||||
{
|
{
|
||||||
#ifdef ENABLE_UTF8
|
#ifdef ENABLE_UTF8
|
||||||
if ((signed char)*string < 0) {
|
if ((unsigned char)*string > 0xC1) {
|
||||||
int charlen = mblen(string, MAXCHARLEN);
|
int charlen = mblen(string, MAXCHARLEN);
|
||||||
|
|
||||||
if (charlen > 0) {
|
if (charlen > 0) {
|
||||||
|
@ -310,10 +310,8 @@ int advance_over(const char *string, size_t *column)
|
||||||
*column += 2;
|
*column += 2;
|
||||||
} else if (*string == 0x7F)
|
} else if (*string == 0x7F)
|
||||||
*column += 2;
|
*column += 2;
|
||||||
#ifndef ENABLE_UTF8
|
else if (!use_utf8 && 0x7F < (unsigned char)*string && (unsigned char)*string < 0xA0)
|
||||||
else if (0x7F < (unsigned char)*string && (unsigned char)*string < 0xA0)
|
|
||||||
*column += 2;
|
*column += 2;
|
||||||
#endif
|
|
||||||
else
|
else
|
||||||
*column += 1;
|
*column += 1;
|
||||||
|
|
||||||
|
|
|
@ -2141,7 +2141,7 @@ void minibar(void)
|
||||||
sprintf(hexadecimal, openfile->current->next ? "U+000A" : "------");
|
sprintf(hexadecimal, openfile->current->next ? "U+000A" : "------");
|
||||||
else if (*this_position == '\n')
|
else if (*this_position == '\n')
|
||||||
sprintf(hexadecimal, "U+0000");
|
sprintf(hexadecimal, "U+0000");
|
||||||
else if ((unsigned char)*this_position >= 0x80 &&
|
else if ((unsigned char)*this_position > 0xC1 &&
|
||||||
mbtowc(&widecode, this_position, MAXCHARLEN) >= 0)
|
mbtowc(&widecode, this_position, MAXCHARLEN) >= 0)
|
||||||
sprintf(hexadecimal, "U+%04X", widecode);
|
sprintf(hexadecimal, "U+%04X", widecode);
|
||||||
else
|
else
|
||||||
|
|
Loading…
Reference in New Issue