diff options
author | Robin Haberkorn <robin.haberkorn@googlemail.com> | 2016-11-01 06:58:18 +0100 |
---|---|---|
committer | Robin Haberkorn <robin.haberkorn@googlemail.com> | 2016-11-01 07:23:49 +0100 |
commit | 9f6cba5c0370aee2f9803abbc35ab7e67f57ee84 (patch) | |
tree | b03485f177d6ff700aac7fc8ff1e7e9e23a61866 /src | |
parent | b5e6f4c61b7b8e220fb3faa071e30b3dfc559f2f (diff) | |
download | sciteco-9f6cba5c0370aee2f9803abbc35ab7e67f57ee84.tar.gz |
globbing now supports character classes; added the ^EN string building construct to escape glob patterns
* globbing is now fnmatch(3)-compatible on every supported platform.
* This means that escaping of glob patterns is possible now.
^ENq has been introduced to ease this task.
* This finally allows you to pass unmodified filenames to EB.
Previously it was impossible to open file names containing glob wildcards.
* this was achieved by moving from GPattern to GRegex as the underlying
implementation.
* The glob pattern is converted to a regular expression before being
compiled to a GRegex.
This turned out to be trickier than anticipated (~140 lines of code)
and of course has a runtime penalty (about two linear passes over the
pattern, i.e. O(n) in the pattern length).
It is IMHO still better than the alternatives, like importing
external code from libiberty, which is potentially non-cross-platform.
* Using GRegex also opens up the possibility of supporting brace "expansions"
later in the form of glob pattern constructs
(they won't actually expand but match alternatives).
* is_glob_pattern() has been simplified and moved to Globber::is_pattern().
It makes sense to reuse the Globber class namespace instead of using
plain functions for functions working on glob patterns.
* The documentation has a new subsection on glob patterns now.
* The test suite has been extended with glob pattern test cases.
Diffstat (limited to 'src')
-rw-r--r-- | src/cmdline.cpp | 2 | ||||
-rw-r--r-- | src/glob.cpp | 203 | ||||
-rw-r--r-- | src/glob.h | 31 | ||||
-rw-r--r-- | src/parser.cpp | 15 | ||||
-rw-r--r-- | src/ring.cpp | 7 |
5 files changed, 228 insertions, 30 deletions
diff --git a/src/cmdline.cpp b/src/cmdline.cpp index 8c5bc42..19a3c66 100644 --- a/src/cmdline.cpp +++ b/src/cmdline.cpp @@ -619,7 +619,7 @@ filename_complete(const gchar *filename, gchar completed, gchar *insert = NULL; gsize prefix_len = 0; - if (is_glob_pattern(filename)) + if (Globber::is_pattern(filename)) return NULL; filename_expanded = expand_path(filename); diff --git a/src/glob.cpp b/src/glob.cpp index 2806fbf..cb9633f 100644 --- a/src/glob.cpp +++ b/src/glob.cpp @@ -61,7 +61,7 @@ Globber::Globber(const gchar *pattern, GFileTest _test) dir = g_dir_open(*dirname ? dirname : ".", 0, NULL); /* if dirname does not exist, dir may be NULL */ - Globber::pattern = g_pattern_spec_new(pattern + dirname_len); + Globber::pattern = compile_pattern(pattern + dirname_len); } gchar * @@ -75,7 +75,7 @@ Globber::next(void) while ((basename = g_dir_read_name(dir))) { gchar *filename; - if (!g_pattern_match_string(pattern, basename)) + if (!g_regex_match(pattern, basename, (GRegexMatchFlags)0, NULL)) continue; /* @@ -100,12 +100,196 @@ Globber::next(void) Globber::~Globber() { if (pattern) - g_pattern_spec_free(pattern); + g_regex_unref(pattern); if (dir) g_dir_close(dir); g_free(dirname); } +gchar * +Globber::escape_pattern(const gchar *pattern) +{ + gsize escaped_len = 1; + gchar *escaped, *pout; + + /* + * NOTE: The exact size of the escaped string is easy to calculate + * in O(n) just like strlen(pattern), so we can just as well + * do that. 
+ */ + for (const gchar *pin = pattern; *pin; pin++) { + switch (*pin) { + case '*': + case '?': + case '[': + escaped_len += 3; + break; + default: + escaped_len++; + break; + } + } + pout = escaped = (gchar *)g_malloc(escaped_len); + + while (*pattern) { + switch (*pattern) { + case '*': + case '?': + case '[': + *pout++ = '['; + *pout++ = *pattern; + *pout++ = ']'; + break; + default: + *pout++ = *pattern; + break; + } + + pattern++; + } + *pout = '\0'; + + return escaped; +} + +/** + * Compile a fnmatch(3)-compatible glob pattern to + * a PCRE regular expression. + * + * There is GPattern, but it only supports the + * "*" and "?" wildcards which most importantly + * do not allow escaping. + * + * @param pattern The pattern to compile. + * @return A new compiled regular expression object. + * Always non-NULL. Unref after use. + */ +GRegex * +Globber::compile_pattern(const gchar *pattern) +{ + gchar *pattern_regex, *pout; + GRegex *pattern_compiled; + + enum { + STATE_WILDCARD, + STATE_CLASS_START, + STATE_CLASS_NEGATE, + STATE_CLASS + } state = STATE_WILDCARD; + + /* + * NOTE: The conversion to regex needs at most two + * characters per input character and the regex pattern + * is required only temporarily, so we use a fixed size + * buffer avoiding reallocations but wasting a few bytes + * (determining the exact required space would be tricky). + * It is not allocated on the stack though since pattern + * might be arbitrary user input and we must avoid + * stack overflows at all costs. + */ + pout = pattern_regex = (gchar *)g_malloc(strlen(pattern)*2 + 1 + 1); + + while (*pattern) { + if (state == STATE_WILDCARD) { + /* + * Outside a character class/set. + */ + switch (*pattern) { + case '*': + *pout++ = '.'; + *pout++ = '*'; + break; + case '?': + *pout++ = '.'; + break; + case '[': + /* + * The special case of an unclosed character + * class is allowed in fnmatch(3) but invalid + * in PCRE, so we must check for it explicitly. 
+ * FIXME: This is sort of inefficient... + */ + if (strchr(pattern, ']')) { + state = STATE_CLASS_START; + *pout++ = '['; + break; + } + /* fall through */ + default: + /* + * For simplicity, all non-alphanumeric + * characters are escaped since they could + * be PCRE magic characters. + * g_regex_escape_string() is inefficient. + * character anyway. + */ + if (!g_ascii_isalnum(*pattern)) + *pout++ = '\\'; + *pout++ = *pattern; + break; + } + } else { + /* + * Within a character class/set. + */ + switch (*pattern) { + case '!': + /* + * fnmatch(3) allows ! instead of ^ immediately + * after the opening bracket. + */ + if (state > STATE_CLASS_START) { + state = STATE_CLASS; + *pout++ = '!'; + break; + } + /* fall through */ + case '^': + state = state == STATE_CLASS_START + ? STATE_CLASS_NEGATE : STATE_CLASS; + *pout++ = '^'; + break; + case ']': + /* + * fnmatch(3) allows the closing bracket as the + * first character to include it in the set, while + * PCRE requires it to be escaped. + */ + if (state == STATE_CLASS) { + state = STATE_WILDCARD; + *pout++ = ']'; + break; + } + /* fall through */ + default: + if (!g_ascii_isalnum(*pattern)) + *pout++ = '\\'; + /* fall through */ + case '-': + state = STATE_CLASS; + *pout++ = *pattern; + break; + } + } + + pattern++; + } + *pout++ = '$'; + *pout = '\0'; + + pattern_compiled = g_regex_new(pattern_regex, + (GRegexCompileFlags)(G_REGEX_DOTALL | G_REGEX_ANCHORED), + (GRegexMatchFlags)0, NULL); + /* + * Since the regex is generated from patterns that are + * always valid, there must be no syntactic error. + */ + g_assert(pattern_compiled != NULL); + + g_free(pattern_regex); + return pattern_compiled; +} + /* * Command States */ @@ -116,10 +300,9 @@ Globber::~Globber() * * EN is a powerful command for performing various tasks * given a glob \fIpattern\fP. - * A \fIpattern\fP is a file name with \(lq*\(rq and - * \(lq?\(rq wildcards: - * \(lq*\(rq matches an arbitrary, possibly empty, string. 
- * \(lq?\(rq matches an arbitrary character. + * For a description of the glob pattern syntax, refer to the section + * .B Glob Patterns + * for details. * * \fIpattern\fP may be omitted, in which case it defaults * to the pattern saved in the search and glob register \(lq_\(rq. @@ -291,7 +474,9 @@ StateGlob_filename::got_file(const gchar *filename) /* * Match pattern against provided file name */ - if (g_pattern_match_simple(pattern_str, filename) && + GRegex *pattern = Globber::compile_pattern(pattern_str); + + if (g_regex_match(pattern, filename, (GRegexMatchFlags)0, NULL) && (!teco_test_mode || g_file_test(filename, file_flags))) { if (!colon_modified) { interface.ssm(SCI_BEGINUNDOACTION); @@ -304,6 +489,8 @@ StateGlob_filename::got_file(const gchar *filename) matching = true; } + + g_regex_unref(pattern); } else if (colon_modified) { /* * Match pattern against directory contents (globbing), @@ -18,6 +18,8 @@ #ifndef __GLOB_H #define __GLOB_H +#include <string.h> + #include <glib.h> #include <glib/gstdio.h> @@ -26,29 +28,11 @@ namespace SciTECO { -/* - * Auxiliary functions - */ -static inline bool -is_glob_pattern(const gchar *str) -{ - if (!str) - return false; - - while (*str) { - if (*str == '*' || *str == '?') - return true; - str++; - } - - return false; -} - class Globber { GFileTest test; gchar *dirname; GDir *dir; - GPatternSpec *pattern; + GRegex *pattern; public: Globber(const gchar *pattern, @@ -56,6 +40,15 @@ public: ~Globber(); gchar *next(void); + + static inline bool + is_pattern(const gchar *str) + { + return str && strpbrk(str, "*?[") != NULL; + } + + static gchar *escape_pattern(const gchar *pattern); + static GRegex *compile_pattern(const gchar *pattern); }; /* diff --git a/src/parser.cpp b/src/parser.cpp index a9e9213..1936837 100644 --- a/src/parser.cpp +++ b/src/parser.cpp @@ -463,6 +463,10 @@ StateCtlE: undo.push_obj(qregspec_machine) = new QRegSpecMachine; set(&&StateCtlEQuote); break; + case 'N': + undo.push_obj(qregspec_machine) = 
new QRegSpecMachine; + set(&&StateCtlEN); + break; default: result = (gchar *)g_malloc(3); @@ -513,6 +517,17 @@ StateCtlEQuote: g_free(str); return true; +StateCtlEN: + if (!qregspec_machine->input(chr, reg)) + return false; + + undo.push_obj(qregspec_machine) = NULL; + set(StateStart); + str = reg->get_string(); + result = Globber::escape_pattern(str); + g_free(str); + return true; + StateEscaped: set(StateStart); result = String::chrdup(chr); diff --git a/src/ring.cpp b/src/ring.cpp index 702807d..077c58d 100644 --- a/src/ring.cpp +++ b/src/ring.cpp @@ -320,7 +320,10 @@ StateEditFile::do_edit(tecoInt id) * <file> may also be a glob pattern, in which case * all regular files matching the pattern are opened/edited. * Globbing is performed exactly the same as the - * EN command does. + * \fBEN\fP command does. + * Also refer to the section called + * .B Glob Patterns + * for more details. * * File names of buffers in the ring are normalized * by making them absolute. @@ -379,7 +382,7 @@ StateEditFile::got_file(const gchar *filename) return &States::start; } - if (is_glob_pattern(filename)) { + if (Globber::is_pattern(filename)) { Globber globber(filename, G_FILE_TEST_IS_REGULAR); gchar *globbed_filename; |