handle invalid multibyte characters more efficiently

git-svn-id: svn://svn.savannah.gnu.org/nano/trunk/nano@2941 35c25a1d-7b9e-4130-9fde-d3aeb78583b8
master
David Lawrence Ramsey 2005-07-26 06:13:45 +00:00
parent d4defccded
commit 96452cb60c
10 changed files with 112 additions and 107 deletions

View File

@ -118,6 +118,11 @@ CVS code -
- color.c:
- Remove unneeded string.h and fcntl.h includes. (DLR)
- chars.c:
mbrep()
- New function, the equivalent of control_mbrep() for non-control
characters. (DLR)
parse_mbchar()
- Remove now-unneeded bad_chr parameter. (DLR)
mbstrchr()
- Don't count matches between valid and invalid multibyte
sequences anymore, for consistency. (DLR)
@ -200,9 +205,6 @@ CVS code -
(DLR)
- Move stdlib.h, dirent.h, regex.h, and assert.h includes here,
as every source file needs them. (DLR)
proto.h:
- Add declarations for bad_mbchar and bad_mbchar_len, so that we
can use them in display_string() as well as chars.c. (DLR)
- rcfile.c:
nregcomp()
- Return TRUE when the compilation succeeds and FALSE otherwise,
@ -237,6 +239,9 @@ CVS code -
the number of lines and characters in the file or selection,
as wc does. (DLR)
- winio.c:
display_string()
- Instead of using parse_mbchar()'s bad_chr parameter, use
mbrep() to get the representation of a bad character. (DLR)
edit_redraw(), edit_refresh()
- Clean up and simplify. (DLR)
edit_update()

View File

@ -41,8 +41,8 @@ static const wchar_t bad_wchar = 0xFFFD;
* Unicode FFFD (Replacement Character), unless we're
* determining if it's a control character or searching for a
* match to it. */
const char *bad_mbchar = "\xEF\xBF\xBD";
const int bad_mbchar_len = 3;
static const char *bad_mbchar = "\xEF\xBF\xBD";
static const int bad_mbchar_len = 3;
#endif
#ifndef HAVE_ISBLANK
@ -241,6 +241,39 @@ char *control_mbrep(const char *c, char *crep, int *crep_len)
return crep;
}
/* c is a multibyte non-control character.  Store its multibyte
 * representation in the caller-supplied buffer crep, store the length
 * of that representation in bytes in *crep_len, and return crep.
 *
 * In UTF-8 mode, if c is not a valid multibyte sequence, the
 * representation stored is bad_mbchar (the replacement character), and
 * if the character cannot be converted back to a multibyte sequence,
 * *crep_len is set to 0.  Outside of UTF-8 mode, the representation is
 * simply the first byte of c.
 *
 * The return value is always the crep pointer that was passed in, never
 * a pointer to static storage: callers such as display_string() assign
 * the result back to their allocated buffer and later free() it, so
 * returning bad_mbchar itself would leak that buffer and free a string
 * literal.
 *
 * NOTE(review): crep is assumed to have room for at least
 * bad_mbchar_len bytes in UTF-8 mode (the visible caller allocates
 * mb_cur_max() bytes) -- confirm for any new callers. */
char *mbrep(const char *c, char *crep, int *crep_len)
{
    assert(c != NULL && crep != NULL && crep_len != NULL);

#ifdef ENABLE_UTF8
    if (ISSET(USE_UTF8)) {
        wchar_t wc;

        if (mbtowc(&wc, c, MB_CUR_MAX) < 0) {
            int i;

            /* Reset mbtowc()'s conversion state after the failure. */
            mbtowc(NULL, NULL, 0);

            /* Copy the replacement character into the caller's buffer
             * instead of redirecting crep to the static string. */
            *crep_len = bad_mbchar_len;
            for (i = 0; i < bad_mbchar_len; i++)
                crep[i] = bad_mbchar[i];
        } else {
            *crep_len = wctomb(crep, wc);

            if (*crep_len < 0) {
                /* Reset wctomb()'s conversion state and report an
                 * empty representation. */
                wctomb(NULL, 0);
                *crep_len = 0;
            }
        }
    } else {
#endif
        *crep_len = 1;
        *crep = *c;
#ifdef ENABLE_UTF8
    }
#endif

    return crep;
}
/* This function is equivalent to wcwidth() for multibyte characters. */
int mbwidth(const char *c)
{
@ -310,19 +343,14 @@ char *make_mbchar(int chr, int *chr_mb_len)
/* Parse a multibyte character from buf. Return the number of bytes
* used. If chr isn't NULL, store the multibyte character in it. If
* bad_chr isn't NULL, set it to TRUE if we have a bad multibyte
* character. If col isn't NULL, store the new display width in it. If
* *str is '\t', we expect col to have the current display width. */
int parse_mbchar(const char *buf, char *chr, bool *bad_chr, size_t
*col)
* col isn't NULL, store the new display width in it. If *buf is '\t',
* we expect col to have the current display width. */
int parse_mbchar(const char *buf, char *chr, size_t *col)
{
int buf_mb_len;
assert(buf != NULL);
if (bad_chr != NULL)
*bad_chr = FALSE;
#ifdef ENABLE_UTF8
if (ISSET(USE_UTF8)) {
/* Get the number of bytes in the multibyte character. */
@ -332,8 +360,6 @@ int parse_mbchar(const char *buf, char *chr, bool *bad_chr, size_t
* to TRUE and interpret buf's first byte. */
if (buf_mb_len < 0) {
mblen(NULL, 0);
if (bad_chr != NULL)
*bad_chr = TRUE;
buf_mb_len = 1;
} else if (buf_mb_len == 0)
buf_mb_len++;
@ -415,8 +441,7 @@ size_t move_mbleft(const char *buf, size_t pos)
/* There is no library function to move backward one multibyte
* character. Here is the naive, O(pos) way to do it. */
while (TRUE) {
int buf_mb_len = parse_mbchar(buf + pos - pos_prev, NULL, NULL,
NULL);
int buf_mb_len = parse_mbchar(buf + pos - pos_prev, NULL, NULL);
if (pos_prev <= (size_t)buf_mb_len)
break;
@ -431,7 +456,7 @@ size_t move_mbleft(const char *buf, size_t pos)
* after the one at pos. */
size_t move_mbright(const char *buf, size_t pos)
{
return pos + parse_mbchar(buf + pos, NULL, NULL, NULL);
return pos + parse_mbchar(buf + pos, NULL, NULL);
}
#ifndef HAVE_STRCASECMP
@ -482,7 +507,7 @@ int mbstrncasecmp(const char *s1, const char *s2, size_t n)
bool bad_s1_mb = FALSE, bad_s2_mb = FALSE;
int s1_mb_len, s2_mb_len;
s1_mb_len = parse_mbchar(s1, s1_mb, NULL, NULL);
s1_mb_len = parse_mbchar(s1, s1_mb, NULL);
if (mbtowc(&ws1, s1_mb, s1_mb_len) < 0) {
mbtowc(NULL, NULL, 0);
@ -490,7 +515,7 @@ int mbstrncasecmp(const char *s1, const char *s2, size_t n)
bad_s1_mb = TRUE;
}
s2_mb_len = parse_mbchar(s2, s2_mb, NULL, NULL);
s2_mb_len = parse_mbchar(s2, s2_mb, NULL);
if (mbtowc(&ws2, s2_mb, s2_mb_len) < 0) {
mbtowc(NULL, NULL, 0);
@ -556,7 +581,7 @@ const char *mbstrcasestr(const char *haystack, const char *needle)
while (*q != '\0') {
bool bad_r_mb = FALSE, bad_q_mb = FALSE;
r_mb_len = parse_mbchar(r, r_mb, NULL, NULL);
r_mb_len = parse_mbchar(r, r_mb, NULL);
if (mbtowc(&wr, r_mb, r_mb_len) < 0) {
mbtowc(NULL, NULL, 0);
@ -564,7 +589,7 @@ const char *mbstrcasestr(const char *haystack, const char *needle)
bad_r_mb = TRUE;
}
q_mb_len = parse_mbchar(q, q_mb, NULL, NULL);
q_mb_len = parse_mbchar(q, q_mb, NULL);
if (mbtowc(&wq, q_mb, q_mb_len) < 0) {
mbtowc(NULL, NULL, 0);
@ -662,7 +687,7 @@ const char *mbrevstrcasestr(const char *haystack, const char *needle,
while (*q != '\0') {
bool bad_r_mb = FALSE, bad_q_mb = FALSE;
r_mb_len = parse_mbchar(r, r_mb, NULL, NULL);
r_mb_len = parse_mbchar(r, r_mb, NULL);
if (mbtowc(&wr, r_mb, r_mb_len) < 0) {
mbtowc(NULL, NULL, 0);
@ -670,7 +695,7 @@ const char *mbrevstrcasestr(const char *haystack, const char *needle,
bad_r_mb = TRUE;
}
q_mb_len = parse_mbchar(q, q_mb, NULL, NULL);
q_mb_len = parse_mbchar(q, q_mb, NULL);
if (mbtowc(&wq, q_mb, q_mb_len) < 0) {
mbtowc(NULL, NULL, 0);
@ -740,7 +765,7 @@ size_t mbstrnlen(const char *s, size_t maxlen)
int s_mb_len;
while (*s != '\0') {
s_mb_len = parse_mbchar(s, NULL, NULL, NULL);
s_mb_len = parse_mbchar(s, NULL, NULL);
if (maxlen == 0)
break;
@ -777,7 +802,7 @@ char *mbstrchr(const char *s, char *c)
}
while (*s != '\0') {
int s_mb_len = parse_mbchar(s, s_mb, NULL, NULL);
int s_mb_len = parse_mbchar(s, s_mb, NULL);
if (mbtowc(&ws, s_mb, s_mb_len) < 0) {
mbtowc(NULL, NULL, 0);
@ -832,7 +857,7 @@ bool has_blank_mbchars(const char *s)
while (*s != '\0') {
int chr_mb_len;
chr_mb_len = parse_mbchar(s, chr_mb, NULL, NULL);
chr_mb_len = parse_mbchar(s, chr_mb, NULL);
if (is_blank_mbchar(chr_mb)) {
retval = TRUE;

View File

@ -2033,9 +2033,9 @@ char *input_tab(char *buf, size_t *place, bool *lastwastab, bool *list)
/* Get the number of single-byte characters that all the
* matches have in common. */
match1_mb_len = parse_mbchar(matches[0] + common_len,
match1_mb, NULL, NULL);
match1_mb, NULL);
match2_mb_len = parse_mbchar(matches[match] +
common_len, match2_mb, NULL, NULL);
common_len, match2_mb, NULL);
match1_mb[match1_mb_len] = '\0';
match2_mb[match2_mb_len] = '\0';
if (strcmp(match1_mb, match2_mb) != 0)
@ -2045,8 +2045,7 @@ char *input_tab(char *buf, size_t *place, bool *lastwastab, bool *list)
if (match < num_matches || matches[0][common_len] == '\0')
break;
common_len += parse_mbchar(buf + common_len, NULL, NULL,
NULL);
common_len += parse_mbchar(buf + common_len, NULL, NULL);
}
free(match1_mb);

View File

@ -227,7 +227,7 @@ bool do_next_word(bool allow_punct, bool allow_update)
* the current word. */
while (!end_line) {
char_mb_len = parse_mbchar(openfile->current->data +
openfile->current_x, char_mb, NULL, NULL);
openfile->current_x, char_mb, NULL);
/* If we've found it, stop moving forward through the current
* line. */
@ -254,7 +254,7 @@ bool do_next_word(bool allow_punct, bool allow_update)
openfile->current = openfile->current->next) {
while (!end_line) {
char_mb_len = parse_mbchar(openfile->current->data +
openfile->current_x, char_mb, NULL, NULL);
openfile->current_x, char_mb, NULL);
/* If we've found it, stop moving forward through the
* current line. */
@ -322,7 +322,7 @@ bool do_prev_word(bool allow_punct, bool allow_update)
* of the current word. */
while (!begin_line) {
char_mb_len = parse_mbchar(openfile->current->data +
openfile->current_x, char_mb, NULL, NULL);
openfile->current_x, char_mb, NULL);
/* If we've found it, stop moving backward through the current
* line. */
@ -352,7 +352,7 @@ bool do_prev_word(bool allow_punct, bool allow_update)
openfile->current = openfile->current->prev) {
while (!begin_line) {
char_mb_len = parse_mbchar(openfile->current->data +
openfile->current_x, char_mb, NULL, NULL);
openfile->current_x, char_mb, NULL);
/* If we've found it, stop moving backward through the
* current line. */
@ -392,9 +392,8 @@ bool do_prev_word(bool allow_punct, bool allow_update)
openfile->current_x);
while (!begin_line) {
char_mb_len =
parse_mbchar(openfile->current->data +
openfile->current_x, char_mb, NULL, NULL);
char_mb_len = parse_mbchar(openfile->current->data +
openfile->current_x, char_mb, NULL);
/* If we've found it, stop moving backward through the
* current line. */

View File

@ -1778,10 +1778,8 @@ void do_output(char *output, size_t output_len, bool allow_cntrls)
}
}
/* Interpret the next multibyte character. If it's an invalid
* multibyte character, interpret it as though it's a byte
* character. */
char_buf_len = parse_mbchar(output + i, char_buf, NULL, NULL);
/* Interpret the next multibyte character. */
char_buf_len = parse_mbchar(output + i, char_buf, NULL);
i += char_buf_len;

View File

@ -132,11 +132,6 @@ extern bool curses_ended;
extern char *homedir;
#ifdef ENABLE_UTF8
extern const char *bad_mbchar;
extern const int bad_mbchar_len;
#endif
/* The functions we want available. */
/* Public functions in chars.c. */
@ -161,11 +156,11 @@ char control_rep(char c);
wchar_t control_wrep(wchar_t c);
#endif
char *control_mbrep(const char *c, char *crep, int *crep_len);
char *mbrep(const char *c, char *crep, int *crep_len);
int mbwidth(const char *c);
int mb_cur_max(void);
char *make_mbchar(int chr, int *chr_mb_len);
int parse_mbchar(const char *buf, char *chr, bool *bad_chr, size_t
*col);
int parse_mbchar(const char *buf, char *chr, size_t *col);
size_t move_mbleft(const char *buf, size_t pos);
size_t move_mbright(const char *buf, size_t pos);
#ifndef HAVE_STRCASECMP

View File

@ -625,11 +625,10 @@ void parse_rcfile(FILE *rcstream)
} else {
whitespace_len[0] =
parse_mbchar(whitespace, NULL,
NULL, NULL);
NULL);
whitespace_len[1] =
parse_mbchar(whitespace +
whitespace_len[0], NULL,
NULL, NULL);
whitespace_len[0], NULL, NULL);
}
} else
#endif

View File

@ -76,7 +76,7 @@ void do_delete(void)
if (openfile->current->data[openfile->current_x] != '\0') {
int char_buf_len = parse_mbchar(openfile->current->data +
openfile->current_x, NULL, NULL, NULL);
openfile->current_x, NULL, NULL);
size_t line_len = strlen(openfile->current->data +
openfile->current_x);
@ -576,7 +576,7 @@ ssize_t break_line(const char *line, ssize_t goal, bool newline)
while (*line != '\0' && goal >= 0) {
size_t pos = 0;
line_len = parse_mbchar(line, NULL, NULL, &pos);
line_len = parse_mbchar(line, NULL, &pos);
if (is_blank_mbchar(line) || (newline && *line == '\n')) {
blank_loc = cur_loc;
@ -599,7 +599,7 @@ ssize_t break_line(const char *line, ssize_t goal, bool newline)
bool found_blank = FALSE;
while (*line != '\0') {
line_len = parse_mbchar(line, NULL, NULL, NULL);
line_len = parse_mbchar(line, NULL, NULL);
if (is_blank_mbchar(line) || (newline && *line == '\n')) {
if (!found_blank)
@ -617,12 +617,12 @@ ssize_t break_line(const char *line, ssize_t goal, bool newline)
/* Move to the last blank after blank_loc, if there is one. */
line -= cur_loc;
line += blank_loc;
line_len = parse_mbchar(line, NULL, NULL, NULL);
line_len = parse_mbchar(line, NULL, NULL);
line += line_len;
while (*line != '\0' && (is_blank_mbchar(line) ||
(newline && *line == '\n'))) {
line_len = parse_mbchar(line, NULL, NULL, NULL);
line_len = parse_mbchar(line, NULL, NULL);
line += line_len;
blank_loc += line_len;
@ -646,7 +646,7 @@ size_t indent_length(const char *line)
blank_mb = charalloc(mb_cur_max());
while (*line != '\0') {
blank_mb_len = parse_mbchar(line, blank_mb, NULL, NULL);
blank_mb_len = parse_mbchar(line, blank_mb, NULL);
if (!is_blank_mbchar(blank_mb))
break;
@ -697,14 +697,14 @@ void justify_format(filestruct *paragraph, size_t skip)
/* If this character is blank, make sure that it's a space with
* no blanks after it. */
if (is_blank_mbchar(end)) {
end_len = parse_mbchar(end, NULL, NULL, NULL);
end_len = parse_mbchar(end, NULL, NULL);
*new_end = ' ';
new_end++;
end += end_len;
while (*end != '\0' && is_blank_mbchar(end)) {
end_len = parse_mbchar(end, NULL, NULL, NULL);
end_len = parse_mbchar(end, NULL, NULL);
end += end_len;
shift += end_len;
@ -722,7 +722,7 @@ void justify_format(filestruct *paragraph, size_t skip)
* more than two blanks after it, and make sure that the blanks
* are spaces. */
} else if (mbstrchr(punct, end) != NULL) {
end_len = parse_mbchar(end, NULL, NULL, NULL);
end_len = parse_mbchar(end, NULL, NULL);
while (end_len > 0) {
*new_end = *end;
@ -732,7 +732,7 @@ void justify_format(filestruct *paragraph, size_t skip)
}
if (*end != '\0' && mbstrchr(brackets, end) != NULL) {
end_len = parse_mbchar(end, NULL, NULL, NULL);
end_len = parse_mbchar(end, NULL, NULL);
while (end_len > 0) {
*new_end = *end;
@ -743,7 +743,7 @@ void justify_format(filestruct *paragraph, size_t skip)
}
if (*end != '\0' && is_blank_mbchar(end)) {
end_len = parse_mbchar(end, NULL, NULL, NULL);
end_len = parse_mbchar(end, NULL, NULL);
*new_end = ' ';
new_end++;
@ -751,7 +751,7 @@ void justify_format(filestruct *paragraph, size_t skip)
}
if (*end != '\0' && is_blank_mbchar(end)) {
end_len = parse_mbchar(end, NULL, NULL, NULL);
end_len = parse_mbchar(end, NULL, NULL);
*new_end = ' ';
new_end++;
@ -759,7 +759,7 @@ void justify_format(filestruct *paragraph, size_t skip)
}
while (*end != '\0' && is_blank_mbchar(end)) {
end_len = parse_mbchar(end, NULL, NULL, NULL);
end_len = parse_mbchar(end, NULL, NULL);
end += end_len;
shift += end_len;
@ -775,7 +775,7 @@ void justify_format(filestruct *paragraph, size_t skip)
/* If this character is neither blank nor punctuation, leave it
* alone. */
} else {
end_len = parse_mbchar(end, NULL, NULL, NULL);
end_len = parse_mbchar(end, NULL, NULL);
while (end_len > 0) {
*new_end = *end;

View File

@ -247,8 +247,8 @@ bool is_whole_word(size_t pos, const char *buf, const char *word)
assert(buf != NULL && pos <= strlen(buf) && word != NULL);
parse_mbchar(buf + move_mbleft(buf, pos), p, NULL, NULL);
parse_mbchar(buf + word_end, r, NULL, NULL);
parse_mbchar(buf + move_mbleft(buf, pos), p, NULL);
parse_mbchar(buf + word_end, r, NULL);
/* If we're at the beginning of the line or the character before the
* word isn't a non-punctuation "word" character, and if we're at

View File

@ -1862,10 +1862,8 @@ void do_statusbar_output(char *output, size_t output_len, bool
}
}
/* Interpret the next multibyte character. If it's an invalid
* multibyte character, interpret it as though it's a byte
* character. */
char_buf_len = parse_mbchar(output + i, char_buf, NULL, NULL);
/* Interpret the next multibyte character. */
char_buf_len = parse_mbchar(output + i, char_buf, NULL);
i += char_buf_len;
@ -1935,7 +1933,7 @@ void do_statusbar_delete(void)
{
if (answer[statusbar_x] != '\0') {
int char_buf_len = parse_mbchar(answer + statusbar_x, NULL,
NULL, NULL);
NULL);
size_t line_len = strlen(answer + statusbar_x);
assert(statusbar_x < strlen(answer));
@ -1982,8 +1980,7 @@ bool do_statusbar_next_word(bool allow_punct)
/* Move forward until we find the character after the last letter of
* the current word. */
while (!end_line) {
char_mb_len = parse_mbchar(answer + statusbar_x, char_mb, NULL,
NULL);
char_mb_len = parse_mbchar(answer + statusbar_x, char_mb, NULL);
/* If we've found it, stop moving forward through the current
* line. */
@ -2007,8 +2004,7 @@ bool do_statusbar_next_word(bool allow_punct)
statusbar_x += char_mb_len;
while (!end_line) {
char_mb_len = parse_mbchar(answer + statusbar_x, char_mb, NULL,
NULL);
char_mb_len = parse_mbchar(answer + statusbar_x, char_mb, NULL);
/* If we've found it, stop moving forward through the current
* line. */
@ -2043,8 +2039,7 @@ bool do_statusbar_prev_word(bool allow_punct)
/* Move backward until we find the character before the first letter
* of the current word. */
while (!begin_line) {
char_mb_len = parse_mbchar(answer + statusbar_x, char_mb, NULL,
NULL);
char_mb_len = parse_mbchar(answer + statusbar_x, char_mb, NULL);
/* If we've found it, stop moving backward through the current
* line. */
@ -2069,8 +2064,7 @@ bool do_statusbar_prev_word(bool allow_punct)
statusbar_x = move_mbleft(answer, statusbar_x);
while (!begin_line) {
char_mb_len = parse_mbchar(answer + statusbar_x, char_mb, NULL,
NULL);
char_mb_len = parse_mbchar(answer + statusbar_x, char_mb, NULL);
/* If we've found it, stop moving backward through the current
* line. */
@ -2093,7 +2087,7 @@ bool do_statusbar_prev_word(bool allow_punct)
while (!begin_line) {
char_mb_len = parse_mbchar(answer + statusbar_x, char_mb,
NULL, NULL);
NULL);
/* If we've found it, stop moving backward through the
* current line. */
@ -2164,7 +2158,7 @@ size_t actual_x(const char *str, size_t xplus)
assert(str != NULL);
while (*str != '\0') {
int str_len = parse_mbchar(str, NULL, NULL, &length);
int str_len = parse_mbchar(str, NULL, &length);
if (length > xplus)
break;
@ -2189,7 +2183,7 @@ size_t strnlenpt(const char *str, size_t size)
assert(str != NULL);
while (*str != '\0') {
int str_len = parse_mbchar(str, NULL, NULL, &length);
int str_len = parse_mbchar(str, NULL, &length);
str += str_len;
@ -2281,8 +2275,6 @@ char *display_string(const char *buf, size_t start_col, size_t len, bool
/* The string we return. */
size_t index;
/* Current position in converted. */
bool bad_char;
/* Whether we have an invalid multibyte character. */
char *buf_mb = charalloc(mb_cur_max());
int buf_mb_len;
@ -2311,8 +2303,7 @@ char *display_string(const char *buf, size_t start_col, size_t len, bool
buf[start_index] != '\t')) {
/* We don't display all of buf[start_index] since it starts to
* the left of the screen. */
buf_mb_len = parse_mbchar(buf + start_index, buf_mb, NULL,
NULL);
buf_mb_len = parse_mbchar(buf + start_index, buf_mb, NULL);
if (is_cntrl_mbchar(buf_mb)) {
if (column < start_col) {
@ -2343,8 +2334,7 @@ char *display_string(const char *buf, size_t start_col, size_t len, bool
}
while (index < alloc_len - 1 && buf[start_index] != '\0') {
buf_mb_len = parse_mbchar(buf + start_index, buf_mb, &bad_char,
NULL);
buf_mb_len = parse_mbchar(buf + start_index, buf_mb, NULL);
/* If buf contains a tab character, interpret it. */
if (*buf_mb == '\t') {
@ -2394,27 +2384,22 @@ char *display_string(const char *buf, size_t start_col, size_t len, bool
#endif
converted[index++] = ' ';
start_col++;
/* If buf contains a non-control character, interpret it. */
/* If buf contains a non-control character, interpret it. If
* buf contains an invalid multibyte non-control character,
* display it as such. */
} else {
int i;
char *nctrl_buf_mb = charalloc(mb_cur_max());
int nctrl_buf_mb_len, i;
#ifdef ENABLE_UTF8
/* If buf contains an invalid multibyte non-control
* character, display it as such. */
if (ISSET(USE_UTF8) && bad_char) {
for (i = 0; i < bad_mbchar_len; i++)
converted[index++] = bad_mbchar[i];
nctrl_buf_mb = mbrep(buf_mb, nctrl_buf_mb,
&nctrl_buf_mb_len);
start_col += mbwidth(bad_mbchar);
} else {
#endif
for (i = 0; i < buf_mb_len; i++)
converted[index++] = buf[start_index + i];
for (i = 0; i < nctrl_buf_mb_len; i++)
converted[index++] = nctrl_buf_mb[i];
start_col += mbwidth(buf_mb);
#ifdef ENABLE_UTF8
}
#endif
start_col += mbwidth(nctrl_buf_mb);
free(nctrl_buf_mb);
}
start_index += buf_mb_len;