add better handling of invalid Unicode, plus a few miscellaneous minor fixes

git-svn-id: svn://svn.savannah.gnu.org/nano/trunk/nano@2973 35c25a1d-7b9e-4130-9fde-d3aeb78583b8
2005-08-04 20:24:26 +00:00 · 2005-08-04 20:24:26 +00:00 · 8c55d21bd6
parent b7bf27a509
commit 8c55d21bd6
3 changed files with 41 additions and 28 deletions
--- a/17
+++ b/17
@@ -137,9 +137,15 @@ CVS code -
 - color.c:
 	- Remove unneeded fcntl.h include. (DLR)
 - chars.c:
+  control_rep(), control_mbrep()
+	- Assert that the multibyte character passed in is a control
+	  character if it's valid. (DLR)
  mbrep()
 	- New function, the equivalent of control_mbrep() for non-control
 	  characters. (DLR)
+	- Treat the Unicode characters D800-DFFF and FFFE-FFFF as
+	  invalid, since the C library's multibyte functions don't seem
+	  to. (DLR)
  parse_mbchar()
 	- Remove now-unneeded bad_chr parameter. (DLR)
  mbstrchr()
@@ -263,10 +269,13 @@ CVS code -
 	  as wc does. (DLR)
 - winio.c:
  get_word_kbinput()
-	- Don't allow the input word to be between hexadecimal D800 to
-	  DFFF or hexadecimal FFFE to FFFD, as they are invalid Unicode
-	  characters; rename variables word and word_digits to uni and
-	  uni_digits; and rename to get_unicode_kbinput(). (DLR)
+	- Multiply the entered digits by hexadecimal numbers instead of
+	  decimal numbers for clarity, rename to get_unicode_kbinput(),
+	  and rename variables word and word_digits to uni and
+	  uni_digits. (DLR)
+  parse_verbatim_kbinput()
+	- Rename variables word_mb and word_mb_len to uni_mb and
+	  uni_mb_len. (DLR)
  display_string()
 	- Instead of using parse_mbchar()'s bad_chr parameter, use
 	  mbrep() to get the representation of a bad character. (DLR)
--- a/src/chars.c
+++ b/src/chars.c
@@ -184,6 +184,8 @@ bool is_word_mbchar(const char *c, bool allow_punct)
 * is (c + 64).  We return that character. */
 char control_rep(char c)
 {
+    assert(is_cntrl_char(c));
+
    /* Treat newlines embedded in a line as encoded nulls. */
    if (c == '\n')
 	return '@';
@@ -198,6 +200,8 @@ char control_rep(char c)
 * where ch is (c + 64).  We return that wide character. */
 wchar_t control_wrep(wchar_t wc)
 {
+    assert(is_cntrl_wchar(wc));
+
    /* Treat newlines embedded in a line as encoded nulls. */
    if (wc == '\n')
 	return '@';
@@ -251,7 +255,10 @@ char *mbrep(const char *c, char *crep, int *crep_len)
    if (ISSET(USE_UTF8)) {
 	wchar_t wc;

-	if (mbtowc(&wc, c, MB_CUR_MAX) < 0) {
+	/* Unicode D800-DFFF and FFFE-FFFF are invalid, even though
+	 * they're parsed properly. */
+	if (mbtowc(&wc, c, MB_CUR_MAX) < 0 || ((0xD800 <= wc && wc <=
+		0xDFFF) || (0XFFFE <= wc && wc <= 0xFFFF))) {
 	    mbtowc(NULL, NULL, 0);
 	    crep = (char *)bad_mbchar;
 	    *crep_len = bad_mbchar_len;
--- a/src/winio.c
+++ b/src/winio.c
@@ -1232,8 +1232,8 @@ int get_byte_kbinput(int kbinput
 }

 /* Translate a Unicode sequence: turn a four-digit hexadecimal number
- * from 0000 to D7FF or E000 to FFFD (case-insensitive) into its
- * corresponding multibyte value. */
+ * from 0000 to FFFF (case-insensitive) into its corresponding
+ * multibyte value. */
 int get_unicode_kbinput(int kbinput
 #ifndef NANO_SMALL
 	, bool reset
@@ -1273,11 +1273,9 @@ int get_unicode_kbinput(int kbinput
 	case 2:
 	    /* Two digits: add the digit we got to the 0x100's position
 	     * of the Unicode sequence holder. */
-	    if (('0' <= kbinput && kbinput <= '7') || (uni != 0xD000 &&
-		'8' <= kbinput && kbinput <= '9'))
+	    if ('0' <= kbinput && kbinput <= '9')
 		uni += (kbinput - '0') * 0x100;
-	    else if (uni != 0xd000 && 'a' <= tolower(kbinput) &&
-		tolower(kbinput) <= 'f')
+	    else if ('a' <= tolower(kbinput) && tolower(kbinput) <= 'f')
 		uni += (tolower(kbinput) + 10 - 'a') * 0x100;
 	    else
 		/* If the character we got isn't a hexadecimal digit, or
@@ -1305,9 +1303,8 @@ int get_unicode_kbinput(int kbinput
 	    if ('0' <= kbinput && kbinput <= '9') {
 		uni += (kbinput - '0');
 		retval = uni;
-	    } else if (('a' <= tolower(kbinput) &&
-		tolower(kbinput) <= 'd') || (uni != 0xFFF0 && 'e' <=
-		tolower(kbinput) && tolower(kbinput) <= 'f')) {
+	    } else if ('a' <= tolower(kbinput) && tolower(kbinput) <=
+		'f') {
 		uni += (tolower(kbinput) + 10 - 'a');
 		retval = uni;
 	    } else
@@ -1418,13 +1415,13 @@ int *get_verbatim_kbinput(WINDOW *win, size_t *kbinput_len)
 * that, leave the input as-is. */ 
 int *parse_verbatim_kbinput(WINDOW *win, size_t *kbinput_len)
 {
-    int *kbinput, word, *retval;
+    int *kbinput, uni, *retval;

    /* Read in the first keystroke. */
    while ((kbinput = get_input(win, 1)) == NULL);

    /* Check whether the first keystroke is a hexadecimal digit. */
-    word = get_unicode_kbinput(*kbinput
+    uni = get_unicode_kbinput(*kbinput
 #ifndef NANO_SMALL
 	, FALSE
 #endif
@@ -1432,36 +1429,36 @@ int *parse_verbatim_kbinput(WINDOW *win, size_t *kbinput_len)

    /* If the first keystroke isn't a hexadecimal digit, put back the
     * first keystroke. */
-    if (word != ERR)
+    if (uni != ERR)
 	unget_input(kbinput, 1);
    /* Otherwise, read in keystrokes until we have a complete word
     * sequence, and put back the corresponding word value. */
    else {
-	char *word_mb;
-	int word_mb_len, *seq, i;
+	char *uni_mb;
+	int uni_mb_len, *seq, i;

-	while (word == ERR) {
+	while (uni == ERR) {
 	    while ((kbinput = get_input(win, 1)) == NULL);

-	    word = get_unicode_kbinput(*kbinput
+	    uni = get_unicode_kbinput(*kbinput
 #ifndef NANO_SMALL
 		, FALSE
 #endif
 		);
 	}

-	/* Put back the multibyte equivalent of the word value. */
-	word_mb = make_mbchar(word, &word_mb_len);
+	/* Put back the multibyte equivalent of the Unicode value. */
+	uni_mb = make_mbchar(uni, &uni_mb_len);

-	seq = (int *)nmalloc(word_mb_len * sizeof(int));
+	seq = (int *)nmalloc(uni_mb_len * sizeof(int));

-	for (i = 0; i < word_mb_len; i++)
-	    seq[i] = (unsigned char)word_mb[i];
+	for (i = 0; i < uni_mb_len; i++)
+	    seq[i] = (unsigned char)uni_mb[i];

-	unget_input(seq, word_mb_len);
+	unget_input(seq, uni_mb_len);

 	free(seq);
-	free(word_mb);
+	free(uni_mb);
    }

    /* Get the complete sequence, and save the characters in it as the