tweaks: avoid parsing a multibyte character twice

The number of bytes in the character was determined twice: first in
mbwidth() and then in char_length().  Do it just once, in mbtowide().

Also, avoid calling is_cntrl_char(), because it does unneeded checks
when we already know that the high bit is set.

This duplicates some code, but advance_over() is called a lot, so it
is important that it is as fast as possible.

This shouldn't slow down plain ASCII, as the extra checks (use_utf8
and *string < 0xA0) are done only for non-ASCII (apart from DEL).
master
Benno Schulenberg 2021-04-09 10:52:29 +02:00
parent f11931a0dd
commit 78f92e044a
1 changed file with 19 additions and 10 deletions

View File

@ -334,13 +334,26 @@ int collect_char(const char *string, char *thechar)
int advance_over(const char *string, size_t *column) int advance_over(const char *string, size_t *column)
{ {
#ifdef ENABLE_UTF8 #ifdef ENABLE_UTF8
if ((signed char)*string < 0) { if ((signed char)*string < 0 && use_utf8) {
if (is_cntrl_char(string)) /* A UTF-8 upper control code has two bytes and takes two columns. */
if (((unsigned char)string[0] == 0xC2 && (signed char)string[1] < -96)) {
*column += 2; *column += 2;
else return 2;
*column += mbwidth(string); } else {
wchar_t wc;
int charlen = mbtowide(&wc, string);
return char_length(string); if (charlen < 0) {
*column += 1;
return 1;
}
int width = wcwidth(wc);
*column += (width < 0) ? 1 : width;
return charlen;
}
} }
#endif #endif
@ -349,12 +362,8 @@ int advance_over(const char *string, size_t *column)
*column += tabsize - *column % tabsize; *column += tabsize - *column % tabsize;
else else
*column += 2; *column += 2;
} else if (*string == 0x7F) } else if (0x7E < (unsigned char)*string && (unsigned char)*string < 0xA0)
*column += 2; *column += 2;
#ifndef ENABLE_UTF8
else if (0x7F < (unsigned char)*string && (unsigned char)*string < 0xA0)
*column += 2;
#endif
else else
*column += 1; *column += 1;