tweaks: avoid parsing a multibyte character twice

The number of bytes in the character was determined twice: first in mbwidth() and then in char_length(). Do it just once, in mbtowide(). Also, avoid calling is_cntrl_char(), because it does unneeded checks when we already know that the high bit is set. This duplicates some code, but advance_over() is called a lot, so it is important that it is as fast as possible. This shouldn't slow down plain ASCII, as the extra checks (use_utf8 and *string < 0xA0) are done only for non-ASCII (apart from DEL).
2021-04-09 10:52:29 +02:00 · 2021-04-09 10:52:29 +02:00 · 78f92e044a
parent f11931a0dd
commit 78f92e044a
1 changed files with 19 additions and 10 deletions
--- a/src/chars.c
+++ b/src/chars.c
@@ -334,13 +334,26 @@ int collect_char(const char *string, char *thechar)
 int advance_over(const char *string, size_t *column)
 {
 #ifdef ENABLE_UTF8
-	if ((signed char)*string < 0) {
-		if (is_cntrl_char(string))
+	if ((signed char)*string < 0 && use_utf8) {
+		/* A UTF-8 upper control code has two bytes and takes two columns. */
+		if (((unsigned char)string[0] == 0xC2 && (signed char)string[1] < -96)) {
 			*column += 2;
-		else
-			*column += mbwidth(string);
+			return 2;
+		} else {
+			wchar_t wc;
+			int charlen = mbtowide(&wc, string);

-		return char_length(string);
+			if (charlen < 0) {
+				*column += 1;
+				return 1;
+			}
+
+			int width = wcwidth(wc);
+
+			*column += (width < 0) ? 1 : width;
+
+			return charlen;
+		}
 	}
 #endif

@@ -349,12 +362,8 @@ int advance_over(const char *string, size_t *column)
 			*column += tabsize - *column % tabsize;
 		else
 			*column += 2;
-	} else if (*string == 0x7F)
+	} else if (0x7E < (unsigned char)*string && (unsigned char)*string < 0xA0)
 		*column += 2;
-#ifndef ENABLE_UTF8
-	else if (0x7F < (unsigned char)*string && (unsigned char)*string < 0xA0)
-		*column += 2;
-#endif
 	else
 		*column += 1;