aboutsummaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
author Robin Haberkorn <robin.haberkorn@googlemail.com> 2024-10-04 23:41:16 +0400
committer Robin Haberkorn <robin.haberkorn@googlemail.com> 2024-10-04 23:41:16 +0400
commit b36ff2502ae3b0e18fa862a01fba9cc2c9067e31 (patch)
tree e4ffef55d060b77706b65ca48c6c0ae62a57e89e
parent 024d26ac0cd869826801889f1299df34676fdf57 (diff)
download sciteco-b36ff2502ae3b0e18fa862a01fba9cc2c9067e31.tar.gz
pattern match characters support ^Q/^R now as well
* Makes it possible, albeit cumbersome, to escape pattern match characters.
* For instance, to search for ^Q, you now have to type S^Q^Q^Q^Q$. To search for ^E you have to type S^Q^Q^Q^E$. But the last character cannot be typed with carets currently (FIXME?). For pattern-only characters, two ^Q should be sufficient as in S^Q^Q^X$.
* Perhaps it would be more elegant to abolish the difference between string building and pattern matching characters to avoid double quoting. But then all string building constructs like ^EQq should operate at the pattern level as well (i.e. match the contents of register q verbatim instead of being interpreted as a pattern). TECOC and TECO-64 don't do that either. If we leave everything as it is, at least a new string building construct should be added for auto-quoting patterns (analogous to ^EN and ^E@).
-rw-r--r-- doc/sciteco.7.template 8
-rw-r--r-- src/parser.c 1
-rw-r--r-- src/search.c 51
-rw-r--r-- tests/testsuite.at 8
4 files changed, 57 insertions, 11 deletions
diff --git a/doc/sciteco.7.template b/doc/sciteco.7.template
index ca93fa6..81deac0 100644
--- a/doc/sciteco.7.template
+++ b/doc/sciteco.7.template
@@ -1789,6 +1789,14 @@ The following pattern match constructs are supported for matching
one character in different character classes
(caret-notations refer to the corresponding control characters):
.TP
+.BI ^Q c
+.TQ
+.BI ^R c
+Escape character \fIc\fP.
+Since these are interpreted as string building characters as well,
+you may have to type two or three \fB^Q\fP in a row to escape a
+pattern match character.
+.TP
.SCITECO_TOPIC ^S ^EB
.B ^S
.TQ
diff --git a/src/parser.c b/src/parser.c
index 7ba9876..8cb26e7 100644
--- a/src/parser.c
+++ b/src/parser.c
@@ -603,6 +603,7 @@ teco_state_stringbuilding_ctle_input(teco_machine_stringbuilding_t *ctx, gunicha
case 'N': next = &teco_state_stringbuilding_ctle_n; break;
default:
if (ctx->result) {
+ /* also makes sure that search patterns can start with ^E */
gchar buf[1+6] = {TECO_CTL_KEY('E')};
gsize len = g_unichar_to_utf8(chr, buf+1);
teco_machine_stringbuilding_append(ctx, buf, 1+len);
diff --git a/src/search.c b/src/search.c
index ed3a00c..c9a2ba0 100644
--- a/src/search.c
+++ b/src/search.c
@@ -115,6 +115,8 @@ teco_state_search_initial(teco_machine_main_t *ctx, GError **error)
typedef enum {
TECO_SEARCH_STATE_START,
+ TECO_SEARCH_STATE_CTL,
+ TECO_SEARCH_STATE_ESCAPE,
TECO_SEARCH_STATE_NOT,
TECO_SEARCH_STATE_CTL_E,
TECO_SEARCH_STATE_ANYQ,
@@ -320,6 +322,18 @@ teco_pattern2regexp(teco_string_t *pattern, guint codepage, gboolean single_expr
do {
/*
+ * Previous character was caret.
+ * Make sure it is handled like a control character.
+ * This is necessary even though we have string building activated,
+ * to support constructs like ^Q^Q (typed with carets) in order to
+ * quote pattern matching characters.
+ */
+ if (state == TECO_SEARCH_STATE_CTL) {
+ *pattern->data = TECO_CTL_KEY(g_ascii_toupper(*pattern->data));
+ state = TECO_SEARCH_STATE_START;
+ }
+
+ /*
* First check whether it is a class.
* This will not treat individual characters
* as classes, so we do not convert them to regexp
@@ -347,21 +361,36 @@ teco_pattern2regexp(teco_string_t *pattern, guint codepage, gboolean single_expr
switch (state) {
case TECO_SEARCH_STATE_START:
switch (*pattern->data) {
- case TECO_CTL_KEY('X'): teco_string_append_c(&re, '.'); break;
- case TECO_CTL_KEY('N'): state = TECO_SEARCH_STATE_NOT; break;
- default: {
- gsize len = codepage == SC_CP_UTF8
- ? g_utf8_next_char(pattern->data) - pattern->data : 1;
- /* the allocation could theoretically be avoided by escaping char-wise */
- g_autofree gchar *escaped = g_regex_escape_string(pattern->data, len);
- teco_string_append(&re, escaped, strlen(escaped));
- pattern->data += len;
- pattern->len -= len;
+ case '^':
+ state = TECO_SEARCH_STATE_CTL;
+ break;
+ case TECO_CTL_KEY('Q'):
+ case TECO_CTL_KEY('R'):
+ state = TECO_SEARCH_STATE_ESCAPE;
+ break;
+ case TECO_CTL_KEY('X'):
+ teco_string_append_c(&re, '.');
+ break;
+ case TECO_CTL_KEY('N'):
+ state = TECO_SEARCH_STATE_NOT;
+ break;
+ default:
+ state = TECO_SEARCH_STATE_ESCAPE;
continue;
}
- }
break;
+ case TECO_SEARCH_STATE_ESCAPE: {
+ gsize len = codepage == SC_CP_UTF8
+ ? g_utf8_next_char(pattern->data) - pattern->data : 1;
+ /* the allocation could theoretically be avoided by escaping char-wise */
+ g_autofree gchar *escaped = g_regex_escape_string(pattern->data, len);
+ teco_string_append(&re, escaped, strlen(escaped));
+ pattern->data += len;
+ pattern->len -= len;
+ continue;
+ }
+
case TECO_SEARCH_STATE_NOT: {
state = TECO_SEARCH_STATE_START;
g_autofree gchar *temp = teco_class2regexp(&state, pattern, codepage, TRUE, error);
diff --git a/tests/testsuite.at b/tests/testsuite.at
index fc7de4f..33f1bf5 100644
--- a/tests/testsuite.at
+++ b/tests/testsuite.at
@@ -79,6 +79,14 @@ AT_CHECK([$SCITECO -e "[[a 23Ub ]]b Qb\"N(0/0)'"], 0, ignore, ignore)
AT_CHECK([$SCITECO -e "[[\$ @FG'..' ]]\$ :Q\$-1Q\$-^^r\"=(0/0)'"], 0, ignore, ignore)
AT_CLEANUP
+AT_SETUP([Searches])
+# FIXME: We cannot currently easily insert a single ASCII 5 (^E), as it must be followed
+# by a 2nd character. It can be quoted, but cannot be written as Caret+E.
+# You also cannot search for a single ASCII 5 using Caret+E.
+# 2 additional ^Q are translated to a single ^Q and interpreted at the search-pattern layer.
+AT_CHECK([$SCITECO -e "@I/^Q\05/ J @:S/^Q^Q^Q\05/\"F(0/0)'"], 0, ignore, ignore)
+AT_CLEANUP
+
AT_SETUP([Editing local registers in macro calls])
AT_CHECK([$SCITECO -e '@^Ua{@EQ.x//} :Ma @^U.x/FOO/'], 0, ignore, ignore)
AT_CHECK([$SCITECO -e '@^Ua{@EQ.x//} Ma @^U.x/FOO/'], 1, ignore, ignore)