From 64aa8757a838960e8c2b028891be217ee5246c05 Mon Sep 17 00:00:00 2001 From: Benno Schulenberg Date: Thu, 26 Jan 2017 16:24:18 +0100 Subject: [PATCH] search: make the \b and \B anchors work correctly in both directions That is: remove the special treatment of BOW anchors, and instead make regexes match against the whole line instead of against an artificially shortened one, because the latter method creates ghost matches: matches at the starting point of the search that aren't really matches when seen in the context of the whole line. This fixes https://savannah.gnu.org/bugs/?50030. --- src/search.c | 26 ++---------------- src/utils.c | 75 ++++++++++++++++++++++++++++++++++------------------ 2 files changed, 52 insertions(+), 49 deletions(-) diff --git a/src/search.c b/src/search.c index c897e8ea..5d1532ae 100644 --- a/src/search.c +++ b/src/search.c @@ -38,8 +38,6 @@ static bool history_changed = FALSE; #ifdef HAVE_REGEX_H static bool regexp_compiled = FALSE; /* Have we compiled any regular expressions? */ -static bool bow_anchored = FALSE; - /* Whether a regex starts with a beginning-of-word anchor. */ /* Compile the given regular expression and store it in search_regexp. * Return TRUE if the expression is valid, and FALSE otherwise. */ @@ -62,10 +60,6 @@ bool regexp_init(const char *regexp) regexp_compiled = TRUE; - /* Remember whether the regex starts with a beginning-of-word anchor. */ - bow_anchored = (strncmp(regexp, "\\<", 2) == 0 || - strncmp(regexp, "\\b", 2) == 0); - return TRUE; } @@ -302,24 +296,8 @@ int findnextstr(const char *needle, bool whole_word_only, size_t *match_len, if (found != NULL) { #ifdef HAVE_REGEX_H /* When doing a regex search, compute the length of the match. */ - if (ISSET(USE_REGEXP)) { + if (ISSET(USE_REGEXP)) found_len = regmatches[0].rm_eo - regmatches[0].rm_so; - - /* If the regex starts with a BOW anchor, check that the found - * match actually is the start of a word. If not, continue. */ - if (bow_anchored && found != line->data) { - size_t before = move_mbleft(line->data, found - line->data); - - /* If a word char is before the match, skip this match. */ - if (is_word_mbchar(line->data + before, FALSE)) { - if (ISSET(BACKWARDS_SEARCH)) - from = line->data + before; - else - from = found + move_mbright(found, 0); - continue; - } - } - } #endif #ifndef DISABLE_SPELLER /* When we're spell checking, a match should be a separate word; @@ -531,7 +509,7 @@ int replace_regexp(char *string, bool create) * subexpression match to the new line. */ if (create) { strncpy(string, openfile->current->data + - openfile->current_x + regmatches[num].rm_so, i); + regmatches[num].rm_so, i); string += i; } } diff --git a/src/utils.c b/src/utils.c index 74a18fa6..4d018f81 100644 --- a/src/utils.c +++ b/src/utils.c @@ -315,41 +315,66 @@ bool is_separate_word(size_t position, size_t length, const char *buf) } #endif /* !DISABLE_SPELLER */ -/* If we are searching backwards, we will find the last match that - * starts no later than start. Otherwise we find the first match - * starting no earlier than start. If we are doing a regexp search, we - * fill in the global variable regmatches with at most 9 subexpression - * matches. Also, all .rm_so elements are relative to the start of the - * whole match, so regmatches[0].rm_so == 0. */ +/* Return the position of the needle in the haystack, or NULL if not found. + * When searching backwards, we will find the last match that starts no later + * than the given start; otherwise, we find the first match starting no earlier + * than start. If we are doing a regexp search, and we find a match, we fill + * in the global variable regmatches with at most 9 subexpression matches. */ const char *strstrwrapper(const char *haystack, const char *needle, const char *start) { #ifdef HAVE_REGEX_H if (ISSET(USE_REGEXP)) { if (ISSET(BACKWARDS_SEARCH)) { - if (regexec(&search_regexp, haystack, 1, regmatches, 0) == 0 && - haystack + regmatches[0].rm_so <= start) { - const char *retval = haystack + regmatches[0].rm_so; + size_t last_find, ceiling, far_end; + size_t floor = 0, next_rung = 0; + /* The start of the search range, and the next start. */ - /* Search forward until there are no more matches. */ - while (regexec(&search_regexp, retval + 1, 1, - regmatches, REG_NOTBOL) == 0 && - retval + regmatches[0].rm_so + 1 <= start) - retval += regmatches[0].rm_so + 1; - /* Finally, put the subexpression matches in global - * variable regmatches. The REG_NOTBOL flag doesn't - * matter now. */ - regexec(&search_regexp, retval, 10, regmatches, 0); - return retval; + if (regexec(&search_regexp, haystack, 1, regmatches, 0) != 0) + return NULL; + + far_end = strlen(haystack); + ceiling = start - haystack; + last_find = regmatches[0].rm_so; + + /* A result beyond the search range also means: no match. */ + if (last_find > ceiling) + return NULL; + + /* Move the start-of-range forward until there is no more match; + * then the last match found is the first match backwards. */ + while (regmatches[0].rm_so <= ceiling) { + floor = next_rung; + last_find = regmatches[0].rm_so; + /* If this is the last possible match, don't try to advance. */ + if (last_find == ceiling) + break; + next_rung = move_mbright(haystack, last_find); + regmatches[0].rm_so = next_rung; + regmatches[0].rm_eo = far_end; + if (regexec(&search_regexp, haystack, 1, regmatches, + REG_STARTEND) != 0) + break; } - } else if (regexec(&search_regexp, start, 10, regmatches, - (start > haystack) ? REG_NOTBOL : 0) == 0) { - const char *retval = start + regmatches[0].rm_so; - regexec(&search_regexp, retval, 10, regmatches, 0); - return retval; + /* Find the last match again, to get possible submatches. */ + regmatches[0].rm_so = floor; + regmatches[0].rm_eo = far_end; + if (regexec(&search_regexp, haystack, 10, regmatches, + REG_STARTEND) != 0) + statusline(ALERT, "BAD: failed to refind the match!"); + + return haystack + regmatches[0].rm_so; } - return NULL; + + /* Do a forward regex search from the starting point. */ + regmatches[0].rm_so = start - haystack; + regmatches[0].rm_eo = strlen(haystack); + if (regexec(&search_regexp, haystack, 10, regmatches, + REG_STARTEND) != 0) + return NULL; + else + return haystack + regmatches[0].rm_so; } #endif /* HAVE_REGEX_H */ if (ISSET(CASE_SENSITIVE)) {