From 78f92e044a1dcaf2a74f7379fa3307e0a4525523 Mon Sep 17 00:00:00 2001 From: Benno Schulenberg Date: Fri, 9 Apr 2021 10:52:29 +0200 Subject: [PATCH] tweaks: avoid parsing a multibyte character twice The number of bytes in the character was determined twice: first in mbwidth() and then in char_length(). Do it just once, in mbtowide(). Also, avoid calling is_cntrl_char(), because it does unneeded checks when we already know that the high bit is set. This duplicates some code, but advance_over() is called a lot, so it is important that it is as fast as possible. This shouldn't slow down plain ASCII, as the extra checks (use_utf8 and *string < 0xA0) are done only for non-ASCII (apart from DEL). --- src/chars.c | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/src/chars.c b/src/chars.c index 2a923215..620d2b26 100644 --- a/src/chars.c +++ b/src/chars.c @@ -334,13 +334,26 @@ int collect_char(const char *string, char *thechar) int advance_over(const char *string, size_t *column) { #ifdef ENABLE_UTF8 - if ((signed char)*string < 0) { - if (is_cntrl_char(string)) + if ((signed char)*string < 0 && use_utf8) { + /* A UTF-8 upper control code has two bytes and takes two columns. */ + if (((unsigned char)string[0] == 0xC2 && (signed char)string[1] < -96)) { *column += 2; - else - *column += mbwidth(string); + return 2; + } else { + wchar_t wc; + int charlen = mbtowide(&wc, string); - return char_length(string); + if (charlen < 0) { + *column += 1; + return 1; + } + + int width = wcwidth(wc); + + *column += (width < 0) ? 
1 : width; + + return charlen; + } } #endif @@ -349,12 +362,8 @@ int advance_over(const char *string, size_t *column) *column += tabsize - *column % tabsize; else *column += 2; - } else if (*string == 0x7F) + } else if (0x7E < (unsigned char)*string && (unsigned char)*string < 0xA0) *column += 2; -#ifndef ENABLE_UTF8 - else if (0x7F < (unsigned char)*string && (unsigned char)*string < 0xA0) - *column += 2; -#endif else *column += 1;