aboutsummaryrefslogtreecommitdiffhomepage
path: root/src/glob.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/glob.c')
-rw-r--r--src/glob.c573
1 files changed, 573 insertions, 0 deletions
diff --git a/src/glob.c b/src/glob.c
new file mode 100644
index 0000000..f6810c2
--- /dev/null
+++ b/src/glob.c
@@ -0,0 +1,573 @@
+/*
+ * Copyright (C) 2012-2021 Robin Haberkorn
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <string.h>
+
+#include <glib.h>
+#include <glib/gprintf.h>
+#include <glib/gstdio.h>
+
+#include "sciteco.h"
+#include "string-utils.h"
+#include "file-utils.h"
+#include "interface.h"
+#include "parser.h"
+#include "core-commands.h"
+#include "expressions.h"
+#include "qreg.h"
+#include "ring.h"
+#include "error.h"
+#include "glob.h"
+
+/*
+ * FIXME: This state could be static.
+ */
+TECO_DECLARE_STATE(teco_state_glob_filename);
+
+/** @memberof teco_globber_t */
+void
+teco_globber_init(teco_globber_t *ctx, const gchar *pattern, GFileTest test)
+{
+ memset(ctx, 0, sizeof(*ctx));
+ ctx->test = test;
+
+ /*
+ * This finds the directory component including
+ * any trailing directory separator
+ * without making up a directory if it is missing
+ * (as g_path_get_dirname() does).
+ * Important since it allows us to construct
+ * file names with the exact same directory
+ * prefix as the input pattern.
+ */
+ gsize dirname_len = teco_file_get_dirname_len(pattern);
+ ctx->dirname = g_strndup(pattern, dirname_len);
+
+ ctx->dir = g_dir_open(*ctx->dirname ? ctx->dirname : ".", 0, NULL);
+ /* if dirname does not exist, the result may be NULL */
+
+ ctx->pattern = teco_globber_compile_pattern(pattern + dirname_len);
+}
+
+/** @memberof teco_globber_t */
+gchar *
+teco_globber_next(teco_globber_t *ctx)
+{
+ const gchar *basename;
+
+ if (!ctx->dir)
+ return NULL;
+
+ while ((basename = g_dir_read_name(ctx->dir))) {
+ if (!g_regex_match(ctx->pattern, basename, 0, NULL))
+ continue;
+
+ /*
+ * As dirname includes the directory separator,
+ * we can simply concatenate dirname with basename.
+ */
+ gchar *filename = g_strconcat(ctx->dirname, basename, NULL);
+
+ /*
+ * No need to perform file test for EXISTS since
+ * g_dir_read_name() will only return existing entries
+ */
+ if (ctx->test == G_FILE_TEST_EXISTS || g_file_test(filename, ctx->test))
+ return filename;
+
+ g_free(filename);
+ }
+
+ return NULL;
+}
+
+/** @memberof teco_globber_t */
+void
+teco_globber_clear(teco_globber_t *ctx)
+{
+ if (ctx->pattern)
+ g_regex_unref(ctx->pattern);
+ if (ctx->dir)
+ g_dir_close(ctx->dir);
+ g_free(ctx->dirname);
+}
+
+/** @static @memberof teco_globber_t */
+gchar *
+teco_globber_escape_pattern(const gchar *pattern)
+{
+ gsize escaped_len = 1;
+ gchar *escaped, *pout;
+
+ /*
+ * NOTE: The exact size of the escaped string is easy to calculate
+ * in O(n) just like strlen(pattern), so we can just as well
+ * do that.
+ */
+ for (const gchar *pin = pattern; *pin; pin++) {
+ switch (*pin) {
+ case '*':
+ case '?':
+ case '[':
+ escaped_len += 3;
+ break;
+ default:
+ escaped_len++;
+ break;
+ }
+ }
+ pout = escaped = g_malloc(escaped_len);
+
+ while (*pattern) {
+ switch (*pattern) {
+ case '*':
+ case '?':
+ case '[':
+ *pout++ = '[';
+ *pout++ = *pattern;
+ *pout++ = ']';
+ break;
+ default:
+ *pout++ = *pattern;
+ break;
+ }
+
+ pattern++;
+ }
+ *pout = '\0';
+
+ return escaped;
+}
+
+/**
+ * Compile a fnmatch(3)-compatible glob pattern to
+ * a PCRE regular expression.
+ *
+ * There is GPattern, but it only supports the
+ * "*" and "?" wildcards which most importantly
+ * do not allow escaping.
+ *
+ * @param pattern The pattern to compile.
+ * @return A new compiled regular expression object.
+ * Always non-NULL. Unref after use.
+ *
+ * @static @memberof teco_globber_t
+ */
+GRegex *
+teco_globber_compile_pattern(const gchar *pattern)
+{
+ enum {
+ STATE_WILDCARD,
+ STATE_CLASS_START,
+ STATE_CLASS_NEGATE,
+ STATE_CLASS
+ } state = STATE_WILDCARD;
+
+ /*
+ * NOTE: The conversion to regex needs at most two
+ * characters per input character and the regex pattern
+ * is required only temporarily, so we use a fixed size
+ * buffer avoiding reallocations but wasting a few bytes
+ * (determining the exact required space would be tricky).
+ * It is not allocated on the stack though since pattern
+ * might be arbitrary user input and we must avoid
+ * stack overflows at all costs.
+ */
+ g_autofree gchar *pattern_regex = g_malloc(strlen(pattern)*2 + 1 + 1);
+ gchar *pout = pattern_regex;
+
+ while (*pattern) {
+ if (state == STATE_WILDCARD) {
+ /*
+ * Outside a character class/set.
+ */
+ switch (*pattern) {
+ case '*':
+ *pout++ = '.';
+ *pout++ = '*';
+ break;
+ case '?':
+ *pout++ = '.';
+ break;
+ case '[':
+ /*
+ * The special case of an unclosed character
+ * class is allowed in fnmatch(3) but invalid
+ * in PCRE, so we must check for it explicitly.
+ * FIXME: This is sort of inefficient...
+ */
+ if (strchr(pattern, ']')) {
+ state = STATE_CLASS_START;
+ *pout++ = '[';
+ break;
+ }
+ /* fall through */
+ default:
+ /*
+ * For simplicity, all non-alphanumeric
+ * characters are escaped since they could
+ * be PCRE magic characters.
+ * g_regex_escape_string() is inefficient.
+ * character anyway.
+ */
+ if (!g_ascii_isalnum(*pattern))
+ *pout++ = '\\';
+ *pout++ = *pattern;
+ break;
+ }
+ } else {
+ /*
+ * Within a character class/set.
+ */
+ switch (*pattern) {
+ case '!':
+ /*
+ * fnmatch(3) allows ! instead of ^ immediately
+ * after the opening bracket.
+ */
+ if (state > STATE_CLASS_START) {
+ state = STATE_CLASS;
+ *pout++ = '!';
+ break;
+ }
+ /* fall through */
+ case '^':
+ state = state == STATE_CLASS_START
+ ? STATE_CLASS_NEGATE : STATE_CLASS;
+ *pout++ = '^';
+ break;
+ case ']':
+ /*
+ * fnmatch(3) allows the closing bracket as the
+ * first character to include it in the set, while
+ * PCRE requires it to be escaped.
+ */
+ if (state == STATE_CLASS) {
+ state = STATE_WILDCARD;
+ *pout++ = ']';
+ break;
+ }
+ /* fall through */
+ default:
+ if (!g_ascii_isalnum(*pattern))
+ *pout++ = '\\';
+ /* fall through */
+ case '-':
+ state = STATE_CLASS;
+ *pout++ = *pattern;
+ break;
+ }
+ }
+
+ pattern++;
+ }
+ *pout++ = '$';
+ *pout = '\0';
+
+ GRegex *pattern_compiled = g_regex_new(pattern_regex,
+ G_REGEX_DOTALL | G_REGEX_ANCHORED, 0, NULL);
+ /*
+ * Since the regex is generated from patterns that are
+ * always valid, there must be no syntactic error.
+ */
+ g_assert(pattern_compiled != NULL);
+
+ return pattern_compiled;
+}
+
+/*
+ * Command States
+ */
+
+static teco_state_t *
+teco_state_glob_pattern_done(teco_machine_main_t *ctx, const teco_string_t *str, GError **error)
+{
+ if (ctx->mode > TECO_MODE_NORMAL)
+ return &teco_state_glob_filename;
+
+ if (str->len > 0) {
+ g_autofree gchar *filename = teco_file_expand_path(str->data);
+
+ teco_qreg_t *glob_reg = teco_qreg_table_find(&teco_qreg_table_globals, "_", 1);
+ g_assert(glob_reg != NULL);
+ if (!glob_reg->vtable->undo_set_string(glob_reg, error) ||
+ !glob_reg->vtable->set_string(glob_reg, filename, strlen(filename), error))
+ return NULL;
+ }
+
+ return &teco_state_glob_filename;
+}
+
+/*$ EN glob
+ * [type]EN[pattern]$[filename]$ -- Glob files or match filename and check file type
+ * [type]:EN[pattern]$[filename]$ -> Success|Failure
+ *
+ * EN is a powerful command for performing various tasks
+ * given a glob \fIpattern\fP.
+ * For a description of the glob pattern syntax, refer to the section
+ * .B Glob Patterns
+ * for details.
+ *
+ * \fIpattern\fP may be omitted, in which case it defaults
+ * to the pattern saved in the search and glob register \(lq_\(rq.
+ * If it is specified, it overwrites the contents of the register
+ * \(lq_\(rq with \fIpattern\fP.
+ * This behaviour is similar to the search and replace commands
+ * and allows for repeated globbing/matching with the same
+ * pattern.
+ * Therefoe you should also save the \(lq_\(rq register on the
+ * Q-Register stack when calling EN from portable macros.
+ *
+ * If \fIfilename\fP is omitted (empty), EN may be used to expand
+ * a glob \fIpattern\fP to a list of matching file names.
+ * This is similar to globbing
+ * on UNIX but not as powerful and may be used e.g. for
+ * iterating over directory contents.
+ * E.g. \(lqEN*.c\fB$$\fP\(rq expands to all \(lq.c\(rq files
+ * in the current directory.
+ * The resulting file names have the exact same directory
+ * component as \fIpattern\fP (if any).
+ * Without \fIfilename\fP, EN will currently only match files
+ * in the file name component
+ * of \fIpattern\fP, not on each component of the path name
+ * separately.
+ * In other words, EN only looks through the directory
+ * of \fIpattern\fP \(em you cannot effectively match
+ * multiple directories.
+ *
+ * If \fIfilename\fP is specified, \fIpattern\fP will only
+ * be matched against that single file name.
+ * If it matches, \fIfilename\fP is used verbatim.
+ * In this form, \fIpattern\fP is matched against the entire
+ * file name, so it is possible to match directory components
+ * as well.
+ * \fIfilename\fP does not necessarily have to exist in the
+ * file system for the match to succeed (unless a file type check
+ * is also specified).
+ * For instance, \(lqENf??/\[**].c\fB$\fPfoo/bar.c\fB$\fP\(rq will
+ * always match and the string \(lqfoo/bar.c\(rq will be inserted
+ * (see below).
+ *
+ * By default, if EN is not colon-modified, the result of
+ * globbing or file name matching is inserted into the current
+ * document, at the current position.
+ * The file names will be separated by line feeds, i.e.
+ * every matching file will be on its own line.
+ *
+ * EN may be colon-modified to avoid any text insertion.
+ * Instead, a boolean is returned that signals whether
+ * any file matched \fIpattern\fP.
+ * E.g. \(lq:EN*.c\fB$$\fP\(rq returns success (-1) if
+ * there is at least one \(lq.c\(rq file in the current directory.
+ *
+ * The results of EN may be filtered by specifying a numeric file
+ * \fItype\fP check argument.
+ * This argument may be omitted (as in the examples above) and defaults
+ * to 0, i.e. no additional checking.
+ * The following file type check values are currently defined:
+ * .IP 0 4
+ * No file type checking is performed.
+ * Note however, that when globbing only directory contents
+ * (of any type) are used, so without the \fIfilename\fP
+ * argument, the value 0 is equivalent to 5.
+ * .IP 1
+ * Only match \fIregular files\fP (no directories).
+ * Will also match symlinks to regular files (on platforms
+ * supporting symlinks).
+ * .IP 2
+ * Only match \fIsymlinks\fP.
+ * On platforms without symlinks (non-UNIX), this will never
+ * match anything.
+ * .IP 3
+ * Only match \fIdirectories\fP.
+ * .IP 4
+ * Only match \fIexecutables\fP.
+ * On UNIX, the executable flag is evaluated, while on
+ * Windows only the file name is checked.
+ * .IP 5
+ * Only match existing files or directories.
+ * When globbing, this check makes no sense and is
+ * equivalent to no check at all.
+ * It may however be used to test that a filename refers
+ * to an existing file.
+ *
+ * For instance, \(lq3EN*\fB$$\fP\(rq will expand to
+ * all subdirectories in the current directory.
+ * The following idiom may be used to check whether
+ * a given filename refers to a regular file:
+ * 1:EN*\fB$\fIfilename\fB$\fR
+ *
+ * Note that both without colon and colon modified
+ * forms of EN save the success or failure of the
+ * operation in the numeric part of the glob register
+ * \(lq_\(rq (i.e. the same value that the colon modified
+ * form would return).
+ * The command itself never fails because of failure
+ * in matching any files.
+ * E.g. if \(lqEN*.c\fB$$\fP\(rq does not match any
+ * files, the EN command is still successful but does
+ * not insert anything. A failure boolean would be saved
+ * in \(lq_\(rq, though.
+ *
+ * String-building characters are enabled for EN and
+ * both string arguments are considered file names
+ * with regard to auto-completions.
+ */
+/*
+ * NOTE: This does not work like classic TECO's
+ * EN command (iterative globbing), since the
+ * position in the directory cannot be reasonably
+ * reset on rubout with glib's API.
+ * If we have to perform all the globbing on initialization
+ * we can just as well return all the results at once.
+ * And we can add them to the current document since
+ * when they should be in a register, the user will
+ * have to edit that register anyway.
+ */
+TECO_DEFINE_STATE_EXPECTFILE(teco_state_glob_pattern,
+ .expectstring.last = FALSE
+);
+
+static teco_state_t *
+teco_state_glob_filename_done(teco_machine_main_t *ctx, const teco_string_t *str, GError **error)
+{
+ if (ctx->mode > TECO_MODE_NORMAL)
+ return &teco_state_start;
+
+ GFileTest file_flags = G_FILE_TEST_EXISTS;
+
+ gboolean matching = FALSE;
+ gboolean colon_modified = teco_machine_main_eval_colon(ctx);
+
+ teco_int_t teco_test_mode;
+
+ if (!teco_expressions_eval(FALSE, error) ||
+ !teco_expressions_pop_num_calc(&teco_test_mode, 0, error))
+ return NULL;
+ switch (teco_test_mode) {
+ /*
+ * 0 means, no file testing.
+ * file_flags will still be G_FILE_TEST_EXISTS which
+ * is equivalent to no testing when using the Globber class.
+ */
+ case 0: break;
+ case 1: file_flags = G_FILE_TEST_IS_REGULAR; break;
+ case 2: file_flags = G_FILE_TEST_IS_SYMLINK; break;
+ case 3: file_flags = G_FILE_TEST_IS_DIR; break;
+ case 4: file_flags = G_FILE_TEST_IS_EXECUTABLE; break;
+ case 5: file_flags = G_FILE_TEST_EXISTS; break;
+ default:
+ g_set_error(error, TECO_ERROR, TECO_ERROR_FAILED,
+ "Invalid file test %" TECO_INT_FORMAT " for <EN>",
+ teco_test_mode);
+ return NULL;
+ }
+
+ teco_qreg_t *glob_reg = teco_qreg_table_find(&teco_qreg_table_globals, "_", 1);
+ g_assert(glob_reg != NULL);
+ g_auto(teco_string_t) pattern_str = {NULL, 0};
+ if (!glob_reg->vtable->get_string(glob_reg, &pattern_str.data, &pattern_str.len, error))
+ return NULL;
+ if (teco_string_contains(&pattern_str, '\0')) {
+ teco_error_qregcontainsnull_set(error, "_", 1, FALSE);
+ return NULL;
+ }
+
+ if (str->len > 0) {
+ /*
+ * Match pattern against provided file name
+ */
+ g_autofree gchar *filename = teco_file_expand_path(str->data);
+ g_autoptr(GRegex) pattern = teco_globber_compile_pattern(pattern_str.data);
+
+ if (g_regex_match(pattern, filename, 0, NULL) &&
+ (teco_test_mode == 0 || g_file_test(filename, file_flags))) {
+ if (!colon_modified) {
+ /*
+ * FIXME: Filenames may contain linefeeds.
+ * But if we add them null-terminated, they will be relatively hard to parse.
+ */
+ teco_interface_ssm(SCI_BEGINUNDOACTION, 0, 0);
+ teco_interface_ssm(SCI_ADDTEXT, strlen(filename),
+ (sptr_t)filename);
+ teco_interface_ssm(SCI_ADDTEXT, 1, (sptr_t)"\n");
+ teco_interface_ssm(SCI_SCROLLCARET, 0, 0);
+ teco_interface_ssm(SCI_ENDUNDOACTION, 0, 0);
+ }
+
+ matching = TRUE;
+ }
+ } else if (colon_modified) {
+ /*
+ * Match pattern against directory contents (globbing),
+ * returning TECO_SUCCESS if at least one file matches
+ */
+ g_auto(teco_globber_t) globber;
+
+ teco_globber_init(&globber, pattern_str.data, file_flags);
+ g_autofree gchar *globbed_filename = teco_globber_next(&globber);
+
+ matching = globbed_filename != NULL;
+ } else {
+ /*
+ * Match pattern against directory contents (globbing),
+ * inserting all matching file names (null-byte-terminated)
+ */
+ g_auto(teco_globber_t) globber;
+ teco_globber_init(&globber, pattern_str.data, file_flags);
+
+ teco_interface_ssm(SCI_BEGINUNDOACTION, 0, 0);
+
+ gchar *globbed_filename;
+ while ((globbed_filename = teco_globber_next(&globber))) {
+ /*
+ * FIXME: Filenames may contain linefeeds.
+ * But if we add them null-terminated, they will be relatively hard to parse.
+ */
+ teco_interface_ssm(SCI_ADDTEXT, strlen(globbed_filename),
+ (sptr_t)globbed_filename);
+ teco_interface_ssm(SCI_ADDTEXT, 1, (sptr_t)"\n");
+
+ g_free(globbed_filename);
+ matching = TRUE;
+ }
+
+ teco_interface_ssm(SCI_SCROLLCARET, 0, 0);
+ teco_interface_ssm(SCI_ENDUNDOACTION, 0, 0);
+ }
+
+ if (colon_modified) {
+ teco_expressions_push(teco_bool(matching));
+ } else if (matching) {
+ /* text has been inserted */
+ teco_ring_dirtify();
+ if (teco_current_doc_must_undo())
+ undo__teco_interface_ssm(SCI_UNDO, 0, 0);
+ }
+
+ if (!glob_reg->vtable->undo_set_integer(glob_reg, error) ||
+ !glob_reg->vtable->set_integer(glob_reg, teco_bool(matching), error))
+ return NULL;
+
+ return &teco_state_start;
+}
+
+TECO_DEFINE_STATE_EXPECTFILE(teco_state_glob_filename);