aboutsummaryrefslogtreecommitdiffhomepage
path: root/src
diff options
context:
space:
mode:
authorRobin Haberkorn <robin.haberkorn@googlemail.com>2016-11-01 06:58:18 +0100
committerRobin Haberkorn <robin.haberkorn@googlemail.com>2016-11-01 07:23:49 +0100
commit9f6cba5c0370aee2f9803abbc35ab7e67f57ee84 (patch)
treeb03485f177d6ff700aac7fc8ff1e7e9e23a61866 /src
parentb5e6f4c61b7b8e220fb3faa071e30b3dfc559f2f (diff)
downloadsciteco-9f6cba5c0370aee2f9803abbc35ab7e67f57ee84.tar.gz
globbing now supports character classes; added the ^EN string building construct to escape glob patterns
* Globbing is now fnmatch(3)-compatible on every supported platform. * This means that escaping of glob patterns is now possible. ^ENq has been introduced to ease this task. * This finally allows you to pass unmodified file names to EB. Previously, it was impossible to open files whose names contain glob wildcards. * This was achieved by moving from GPattern to GRegex as the underlying implementation. * The glob pattern is converted to a regular expression before being compiled to a GRegex. This turned out to be trickier than anticipated (~140 lines of code) and, of course, has a runtime penalty (complexity is O(2*n) over the pattern length). It is IMHO still better than the alternatives, like importing external code from libiberty, which is potentially not cross-platform. * Using GRegex also opens up the possibility of supporting brace "expansions" later in the form of glob pattern constructs (they won't actually expand but match alternatives). * is_glob_pattern() has been simplified and moved to Globber::is_pattern(). It makes sense to reuse the Globber class namespace instead of using plain functions for functions working on glob patterns. * The documentation now has a new subsection on glob patterns. * The test suite has been extended with glob pattern test cases.
Diffstat (limited to 'src')
-rw-r--r--src/cmdline.cpp2
-rw-r--r--src/glob.cpp203
-rw-r--r--src/glob.h31
-rw-r--r--src/parser.cpp15
-rw-r--r--src/ring.cpp7
5 files changed, 228 insertions, 30 deletions
diff --git a/src/cmdline.cpp b/src/cmdline.cpp
index 8c5bc42..19a3c66 100644
--- a/src/cmdline.cpp
+++ b/src/cmdline.cpp
@@ -619,7 +619,7 @@ filename_complete(const gchar *filename, gchar completed,
gchar *insert = NULL;
gsize prefix_len = 0;
- if (is_glob_pattern(filename))
+ if (Globber::is_pattern(filename))
return NULL;
filename_expanded = expand_path(filename);
diff --git a/src/glob.cpp b/src/glob.cpp
index 2806fbf..cb9633f 100644
--- a/src/glob.cpp
+++ b/src/glob.cpp
@@ -61,7 +61,7 @@ Globber::Globber(const gchar *pattern, GFileTest _test)
dir = g_dir_open(*dirname ? dirname : ".", 0, NULL);
/* if dirname does not exist, dir may be NULL */
- Globber::pattern = g_pattern_spec_new(pattern + dirname_len);
+ Globber::pattern = compile_pattern(pattern + dirname_len);
}
gchar *
@@ -75,7 +75,7 @@ Globber::next(void)
while ((basename = g_dir_read_name(dir))) {
gchar *filename;
- if (!g_pattern_match_string(pattern, basename))
+ if (!g_regex_match(pattern, basename, (GRegexMatchFlags)0, NULL))
continue;
/*
@@ -100,12 +100,196 @@ Globber::next(void)
Globber::~Globber()
{
if (pattern)
- g_pattern_spec_free(pattern);
+ g_regex_unref(pattern);
if (dir)
g_dir_close(dir);
g_free(dirname);
}
+/**
+ * Escape a string so that it matches only itself when
+ * used as a glob pattern.
+ *
+ * Each of the glob magic characters "*", "?" and "["
+ * is wrapped into a single-member character class
+ * (e.g. "*" becomes "[*]"); all other characters are
+ * copied verbatim.
+ *
+ * @param pattern The null-terminated string to escape.
+ * @return A newly allocated, null-terminated escaped string.
+ *         It is allocated with g_malloc().
+ */
+gchar *
+Globber::escape_pattern(const gchar *pattern)
+{
+	/* magic characters that have to be wrapped in brackets */
+	static const gchar magic[] = "*?[";
+
+	const gchar *src;
+	gchar *result, *dst;
+	/* one byte is reserved for the trailing null byte */
+	gsize size = 1;
+
+	/*
+	 * First pass: determine the exact size of the result,
+	 * which is as cheap as a strlen(pattern).
+	 * A magic character expands to three bytes.
+	 */
+	for (src = pattern; *src; src++)
+		size += strchr(magic, *src) ? 3 : 1;
+
+	result = dst = (gchar *)g_malloc(size);
+
+	/*
+	 * Second pass: copy the string, wrapping magic characters.
+	 */
+	for (src = pattern; *src; src++) {
+		if (strchr(magic, *src)) {
+			*dst++ = '[';
+			*dst++ = *src;
+			*dst++ = ']';
+		} else {
+			*dst++ = *src;
+		}
+	}
+	*dst = '\0';
+
+	return result;
+}
+
+
+/**
+ * Check whether a pattern byte must be backslash-escaped
+ * when copied literally into the generated regular expression.
+ *
+ * Only ASCII non-alphanumeric characters are escaped since
+ * only they could be PCRE magic characters.
+ * Bytes with the high bit set are never escaped: they may be
+ * part of a multi-byte UTF-8 sequence, which must not be
+ * split up by inserted backslashes.
+ */
+static inline bool
+glob_needs_escape(gchar chr)
+{
+	return (guchar)chr < 0x80 && !g_ascii_isalnum(chr);
+}
+
+/**
+ * Compile a fnmatch(3)-compatible glob pattern to
+ * a PCRE regular expression.
+ *
+ * There is GPattern, but it only supports the
+ * "*" and "?" wildcards which most importantly
+ * do not allow escaping.
+ *
+ * The following constructs are translated:
+ * "*" matches an arbitrary, possibly empty, string,
+ * "?" matches an arbitrary character and "[...]" is a
+ * character class, which may be negated by a leading
+ * "!" or "^" and may contain "]" as its first member.
+ * A "[" that does not begin a terminated character class
+ * is matched literally.
+ *
+ * @param pattern The pattern to compile.
+ * @return A new compiled regular expression object.
+ *         Always non-NULL. Unref after use.
+ */
+GRegex *
+Globber::compile_pattern(const gchar *pattern)
+{
+	gchar *pattern_regex, *pout;
+	const gchar *class_rest;
+	GRegex *pattern_compiled;
+
+	enum {
+		STATE_WILDCARD,
+		STATE_CLASS_START,
+		STATE_CLASS_NEGATE,
+		STATE_CLASS
+	} state = STATE_WILDCARD;
+
+	/*
+	 * NOTE: The conversion to regex needs at most two
+	 * characters per input character and the regex pattern
+	 * is required only temporarily, so we use a fixed size
+	 * buffer avoiding reallocations but wasting a few bytes
+	 * (determining the exact required space would be tricky).
+	 * It is not allocated on the stack though since pattern
+	 * might be arbitrary user input and we must avoid
+	 * stack overflows at all costs.
+	 * The two additional bytes are for the trailing "$"
+	 * and the null byte.
+	 */
+	pout = pattern_regex = (gchar *)g_malloc(strlen(pattern)*2 + 1 + 1);
+
+	while (*pattern) {
+		if (state == STATE_WILDCARD) {
+			/*
+			 * Outside a character class/set.
+			 */
+			switch (*pattern) {
+			case '*':
+				*pout++ = '.';
+				*pout++ = '*';
+				break;
+			case '?':
+				*pout++ = '.';
+				break;
+			case '[':
+				/*
+				 * The special case of an unclosed character
+				 * class is allowed in fnmatch(3) but invalid
+				 * in PCRE, so we must check for it explicitly.
+				 * A closing bracket directly after the opening
+				 * bracket (and the optional negation) is a
+				 * literal member of the set and does NOT
+				 * terminate it, so it must be skipped before
+				 * looking for the terminating bracket.
+				 * Otherwise, patterns like "[]" would result in
+				 * an unterminated PCRE character class.
+				 * FIXME: This is sort of inefficient...
+				 */
+				class_rest = pattern + 1;
+				if (*class_rest == '!' || *class_rest == '^')
+					class_rest++;
+				if (*class_rest == ']')
+					class_rest++;
+				if (strchr(class_rest, ']')) {
+					state = STATE_CLASS_START;
+					*pout++ = '[';
+					break;
+				}
+				/* fall through */
+			default:
+				/*
+				 * For simplicity, all non-alphanumeric
+				 * ASCII characters are escaped since they
+				 * could be PCRE magic characters.
+				 * This is done manually, one character at
+				 * a time, since g_regex_escape_string()
+				 * is inefficient.
+				 */
+				if (glob_needs_escape(*pattern))
+					*pout++ = '\\';
+				*pout++ = *pattern;
+				break;
+			}
+		} else {
+			/*
+			 * Within a character class/set.
+			 */
+			switch (*pattern) {
+			case '!':
+				/*
+				 * fnmatch(3) allows ! instead of ^ immediately
+				 * after the opening bracket.
+				 * Anywhere else, it is an ordinary member
+				 * of the set.
+				 */
+				if (state != STATE_CLASS_START) {
+					state = STATE_CLASS;
+					*pout++ = '!';
+					break;
+				}
+				/* fall through */
+			case '^':
+				/*
+				 * Only negates immediately after the opening
+				 * bracket. PCRE treats "^" at any other
+				 * position as a literal as well.
+				 */
+				state = state == STATE_CLASS_START
+						? STATE_CLASS_NEGATE : STATE_CLASS;
+				*pout++ = '^';
+				break;
+			case ']':
+				/*
+				 * fnmatch(3) allows the closing bracket as the
+				 * first character to include it in the set, while
+				 * PCRE requires it to be escaped.
+				 */
+				if (state == STATE_CLASS) {
+					state = STATE_WILDCARD;
+					*pout++ = ']';
+					break;
+				}
+				/* fall through */
+			default:
+				if (glob_needs_escape(*pattern))
+					*pout++ = '\\';
+				/* fall through */
+			case '-':
+				/*
+				 * The range operator is passed through
+				 * unescaped.
+				 */
+				state = STATE_CLASS;
+				*pout++ = *pattern;
+				break;
+			}
+		}
+
+		pattern++;
+	}
+	/*
+	 * Together with G_REGEX_ANCHORED, this makes sure
+	 * that the pattern must match the entire string.
+	 */
+	*pout++ = '$';
+	*pout = '\0';
+
+	pattern_compiled = g_regex_new(pattern_regex,
+	                               (GRegexCompileFlags)(G_REGEX_DOTALL | G_REGEX_ANCHORED),
+	                               (GRegexMatchFlags)0, NULL);
+	/*
+	 * Since the regex is generated from patterns that are
+	 * always valid, there must be no syntactic error.
+	 * NOTE(review): A descending range like "[z-a]" is passed
+	 * through unchanged and is presumably still rejected by
+	 * PCRE - confirm and handle it if necessary.
+	 */
+	g_assert(pattern_compiled != NULL);
+
+	g_free(pattern_regex);
+	return pattern_compiled;
+}
+
+
/*
* Command States
*/
@@ -116,10 +300,9 @@ Globber::~Globber()
*
* EN is a powerful command for performing various tasks
* given a glob \fIpattern\fP.
- * A \fIpattern\fP is a file name with \(lq*\(rq and
- * \(lq?\(rq wildcards:
- * \(lq*\(rq matches an arbitrary, possibly empty, string.
- * \(lq?\(rq matches an arbitrary character.
+ * For a description of the glob pattern syntax, refer to the section
+ * .B Glob Patterns
+ * for details.
*
* \fIpattern\fP may be omitted, in which case it defaults
* to the pattern saved in the search and glob register \(lq_\(rq.
@@ -291,7 +474,9 @@ StateGlob_filename::got_file(const gchar *filename)
/*
* Match pattern against provided file name
*/
- if (g_pattern_match_simple(pattern_str, filename) &&
+ GRegex *pattern = Globber::compile_pattern(pattern_str);
+
+ if (g_regex_match(pattern, filename, (GRegexMatchFlags)0, NULL) &&
(!teco_test_mode || g_file_test(filename, file_flags))) {
if (!colon_modified) {
interface.ssm(SCI_BEGINUNDOACTION);
@@ -304,6 +489,8 @@ StateGlob_filename::got_file(const gchar *filename)
matching = true;
}
+
+ g_regex_unref(pattern);
} else if (colon_modified) {
/*
* Match pattern against directory contents (globbing),
diff --git a/src/glob.h b/src/glob.h
index 6222520..1577242 100644
--- a/src/glob.h
+++ b/src/glob.h
@@ -18,6 +18,8 @@
#ifndef __GLOB_H
#define __GLOB_H
+#include <string.h>
+
#include <glib.h>
#include <glib/gstdio.h>
@@ -26,29 +28,11 @@
namespace SciTECO {
-/*
- * Auxiliary functions
- */
-static inline bool
-is_glob_pattern(const gchar *str)
-{
- if (!str)
- return false;
-
- while (*str) {
- if (*str == '*' || *str == '?')
- return true;
- str++;
- }
-
- return false;
-}
-
class Globber {
GFileTest test;
gchar *dirname;
GDir *dir;
- GPatternSpec *pattern;
+ GRegex *pattern;
public:
Globber(const gchar *pattern,
@@ -56,6 +40,15 @@ public:
~Globber();
gchar *next(void);
+
+ /**
+  * Check whether a string contains any glob magic
+  * character ("*", "?" or "[").
+  *
+  * @param str The string to check. May be NULL.
+  * @return true if str is non-NULL and contains at
+  *         least one of the magic characters.
+  */
+ static inline bool
+ is_pattern(const gchar *str)
+ {
+     if (!str)
+         return false;
+
+     return strpbrk(str, "*?[") != NULL;
+ }
+
+ static gchar *escape_pattern(const gchar *pattern);
+ static GRegex *compile_pattern(const gchar *pattern);
};
/*
diff --git a/src/parser.cpp b/src/parser.cpp
index a9e9213..1936837 100644
--- a/src/parser.cpp
+++ b/src/parser.cpp
@@ -463,6 +463,10 @@ StateCtlE:
undo.push_obj(qregspec_machine) = new QRegSpecMachine;
set(&&StateCtlEQuote);
break;
+ case 'N':
+ undo.push_obj(qregspec_machine) = new QRegSpecMachine;
+ set(&&StateCtlEN);
+ break;
default:
result = (gchar *)g_malloc(3);
@@ -513,6 +517,17 @@ StateCtlEQuote:
g_free(str);
return true;
+StateCtlEN:
+ if (!qregspec_machine->input(chr, reg))
+ return false;
+
+ undo.push_obj(qregspec_machine) = NULL;
+ set(StateStart);
+ str = reg->get_string();
+ result = Globber::escape_pattern(str);
+ g_free(str);
+ return true;
+
StateEscaped:
set(StateStart);
result = String::chrdup(chr);
diff --git a/src/ring.cpp b/src/ring.cpp
index 702807d..077c58d 100644
--- a/src/ring.cpp
+++ b/src/ring.cpp
@@ -320,7 +320,10 @@ StateEditFile::do_edit(tecoInt id)
* <file> may also be a glob pattern, in which case
* all regular files matching the pattern are opened/edited.
* Globbing is performed exactly the same as the
- * EN command does.
+ * \fBEN\fP command does.
+ * Also refer to the section called
+ * .B Glob Patterns
+ * for more details.
*
* File names of buffers in the ring are normalized
* by making them absolute.
@@ -379,7 +382,7 @@ StateEditFile::got_file(const gchar *filename)
return &States::start;
}
- if (is_glob_pattern(filename)) {
+ if (Globber::is_pattern(filename)) {
Globber globber(filename, G_FILE_TEST_IS_REGULAR);
gchar *globbed_filename;