aboutsummaryrefslogtreecommitdiffhomepage
path: root/src/eol.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/eol.c')
-rw-r--r--src/eol.c461
1 files changed, 461 insertions, 0 deletions
diff --git a/src/eol.c b/src/eol.c
new file mode 100644
index 0000000..44ad021
--- /dev/null
+++ b/src/eol.c
@@ -0,0 +1,461 @@
+/*
+ * Copyright (C) 2012-2021 Robin Haberkorn
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <string.h>
+
+#include <glib.h>
+
+#include <Scintilla.h>
+
+#include "sciteco.h"
+#include "eol.h"
+
+/**
+ * Map a Scintilla EOL mode to the byte sequence terminating a line.
+ *
+ * @param eol_mode One of the SC_EOL_* constants.
+ * @return A statically allocated string: "\r\n" for SC_EOL_CRLF,
+ *         "\r" for SC_EOL_CR and "\n" for SC_EOL_LF as well as
+ *         for every other value.
+ */
+const gchar *
+teco_eol_get_seq(gint eol_mode)
+{
+	if (eol_mode == SC_EOL_CRLF)
+		return "\r\n";
+	if (eol_mode == SC_EOL_CR)
+		return "\r";
+
+	/* SC_EOL_LF and any unknown mode */
+	return "\n";
+}
+
+/*
+ * Common part of the reader constructors:
+ * Zero the entire object and mark the EOL style as not
+ * yet detected (negative value).
+ */
+static inline void
+teco_eol_reader_init(teco_eol_reader_t *ctx)
+{
+	memset(ctx, '\0', sizeof *ctx);
+
+	/* teco_eol_reader_convert() sets this on the first EOL sequence */
+	ctx->eol_style = -1;
+}
+
+/*
+ * Read callback of GIOChannel-backed readers:
+ * Refill the reader's internal buffer from the channel.
+ * The number of bytes read is stored in *read_len and
+ * the channel's status is passed through unchanged.
+ */
+static GIOStatus
+teco_eol_reader_read_gio(teco_eol_reader_t *ctx, gsize *read_len, GError **error)
+{
+	gchar *dest = ctx->gio.buffer;
+	const gsize capacity = sizeof(ctx->gio.buffer);
+
+	return g_io_channel_read_chars(ctx->gio.channel, dest, capacity,
+	                               read_len, error);
+}
+
+/**
+ * Initialize an EOL reader that pulls its data from a GIOChannel.
+ *
+ * The reader is reset, the GIOChannel read callback is installed
+ * and the channel is handed to teco_eol_reader_set_channel()
+ * (not defined in this file).
+ *
+ * @param ctx The EOL Reader object to initialize.
+ * @param channel The channel to read from.
+ *   NOTE(review): teco_eol_reader_clear() unreferences the stored
+ *   channel, so teco_eol_reader_set_channel() presumably takes
+ *   its own reference - confirm in eol.h.
+ *
+ * @memberof teco_eol_reader_t
+ */
+void
+teco_eol_reader_init_gio(teco_eol_reader_t *ctx, GIOChannel *channel)
+{
+ teco_eol_reader_init(ctx);
+ ctx->read_cb = teco_eol_reader_read_gio;
+
+ /* the callback is installed before the channel is attached */
+ teco_eol_reader_set_channel(ctx, channel);
+}
+
+/*
+ * Read callback of memory-backed readers:
+ * The whole memory buffer is reported as a single block.
+ * The stored length is reset afterwards, so every
+ * subsequent call reports 0 bytes.
+ */
+static GIOStatus
+teco_eol_reader_read_mem(teco_eol_reader_t *ctx, gsize *read_len, GError **error)
+{
+	*read_len = ctx->mem.len;
+	ctx->mem.len = 0;
+
+	/* only the first call on a non-empty buffer delivers data */
+	if (*read_len == 0)
+		return G_IO_STATUS_EOF;
+
+	return G_IO_STATUS_NORMAL;
+}
+
+/**
+ * Initialize an EOL reader that converts a buffer in memory.
+ *
+ * @param ctx The EOL Reader object to initialize.
+ * @param buffer The data to convert. Only the pointer is stored and
+ *   teco_eol_reader_convert() rewrites CRs within this buffer, so it
+ *   must stay valid and writable while the reader is in use.
+ * @param len Length of buffer in bytes.
+ *
+ * @memberof teco_eol_reader_t
+ */
+void
+teco_eol_reader_init_mem(teco_eol_reader_t *ctx, gchar *buffer, gsize len)
+{
+	teco_eol_reader_init(ctx);
+
+	ctx->mem.len = len;
+	ctx->mem.buffer = buffer;
+	ctx->read_cb = teco_eol_reader_read_mem;
+}
+
+/**
+ * Read data with automatic EOL translation.
+ *
+ * This gets the next data block from the reader's
+ * data source, performs EOL translation (if enabled)
+ * in a more or less efficient manner and returns
+ * a chunk of EOL-normalized data.
+ *
+ * Since the underlying data source may have to be
+ * queried repeatedly and because the EOL Reader avoids
+ * reassembling the EOL-normalized data by returning
+ * references into the modified data source, it is
+ * necessary to call this function repeatedly until
+ * it returns G_IO_STATUS_EOF.
+ *
+ * @param ctx The EOL Reader object.
+ * @param ret Location to store a pointer to the converted chunk.
+ *            The EOL-converted data is NOT null-terminated.
+ *            It points into the reader's buffer and is only valid
+ *            until the next call.
+ *            Not written when G_IO_STATUS_EOF or G_IO_STATUS_ERROR
+ *            is returned.
+ * @param data_len A pointer to the length of the converted chunk.
+ *                 The length may be 0.
+ *                 Not written when G_IO_STATUS_EOF or
+ *                 G_IO_STATUS_ERROR is returned.
+ * @param error A GError.
+ * @return The status of the conversion:
+ *         G_IO_STATUS_NORMAL if a chunk was returned,
+ *         G_IO_STATUS_EOF once the data source is exhausted and
+ *         G_IO_STATUS_ERROR if reading from the data source failed.
+ *
+ * @memberof teco_eol_reader_t
+ */
+GIOStatus
+teco_eol_reader_convert(teco_eol_reader_t *ctx, gchar **ret, gsize *data_len, GError **error)
+{
+	gchar *buffer = ctx->read_cb == teco_eol_reader_read_gio ? ctx->gio.buffer : ctx->mem.buffer;
+
+	if (ctx->last_char < 0) {
+		/*
+		 * A CRLF was last translated:
+		 * Skip the LF following the block returned last.
+		 */
+		ctx->block_len++;
+		ctx->last_char = '\n';
+	}
+	/* continue after the block returned by the previous call */
+	ctx->offset += ctx->block_len;
+
+	if (ctx->offset == ctx->read_len) {
+		/* buffer has been consumed entirely - refill it */
+		ctx->offset = 0;
+
+		switch (ctx->read_cb(ctx, &ctx->read_len, error)) {
+		case G_IO_STATUS_ERROR:
+			return G_IO_STATUS_ERROR;
+
+		case G_IO_STATUS_EOF:
+			if (ctx->last_char == '\r') {
+				/*
+				 * Very last character read is CR.
+				 * If this is the only EOL so far, the
+				 * EOL style is MAC.
+				 * This is also executed if auto-eol is disabled
+				 * but it doesn't hurt.
+				 */
+				if (ctx->eol_style < 0)
+					ctx->eol_style = SC_EOL_CR;
+				else if (ctx->eol_style != SC_EOL_CR)
+					ctx->eol_style_inconsistent = TRUE;
+			}
+
+			return G_IO_STATUS_EOF;
+
+		case G_IO_STATUS_NORMAL:
+		case G_IO_STATUS_AGAIN:
+			break;
+		}
+
+		if (!(teco_ed & TECO_ED_AUTOEOL)) {
+			/*
+			 * No EOL translation - always return entire
+			 * buffer
+			 */
+			*data_len = ctx->block_len = ctx->read_len;
+			*ret = buffer;
+			return G_IO_STATUS_NORMAL;
+		}
+	}
+
+	/*
+	 * Return data with automatic EOL translation.
+	 * Every EOL sequence is normalized to LF and
+	 * the first sequence determines the document's
+	 * EOL style.
+	 * This loop is executed for every byte of the
+	 * file/stream, so it was important to optimize
+	 * it. Specifically, the number of returns
+	 * is minimized by keeping a pointer to
+	 * the beginning of a block of data in the buffer
+	 * which already has LFs (offset).
+	 * Mac EOLs can be converted to UNIX EOLs directly
+	 * in the buffer.
+	 * So if their EOLs are consistent, the function
+	 * will return one block for the entire buffer.
+	 * When reading a file with DOS EOLs, there will
+	 * be one call per line which is significantly slower.
+	 *
+	 * NOTE: The index has the type of `read_len`'s
+	 * pointee (gsize), so that it cannot wrap around
+	 * on large memory buffers.
+	 */
+	for (gsize i = ctx->offset; i < ctx->read_len; i++) {
+		switch (buffer[i]) {
+		case '\n':
+			if (ctx->last_char == '\r') {
+				if (ctx->eol_style < 0)
+					ctx->eol_style = SC_EOL_CRLF;
+				else if (ctx->eol_style != SC_EOL_CRLF)
+					ctx->eol_style_inconsistent = TRUE;
+
+				/*
+				 * Return block. CR has already
+				 * been made LF in `buffer`.
+				 */
+				*data_len = ctx->block_len = i-ctx->offset;
+				/* next call will skip the LF of this CRLF */
+				ctx->last_char = -1;
+				*ret = buffer + ctx->offset;
+				return G_IO_STATUS_NORMAL;
+			}
+
+			if (ctx->eol_style < 0)
+				ctx->eol_style = SC_EOL_LF;
+			else if (ctx->eol_style != SC_EOL_LF)
+				ctx->eol_style_inconsistent = TRUE;
+			/*
+			 * No conversion necessary and no need to
+			 * return block yet.
+			 */
+			ctx->last_char = '\n';
+			break;
+
+		case '\r':
+			if (ctx->last_char == '\r') {
+				/* the previous CR was a Mac EOL on its own */
+				if (ctx->eol_style < 0)
+					ctx->eol_style = SC_EOL_CR;
+				else if (ctx->eol_style != SC_EOL_CR)
+					ctx->eol_style_inconsistent = TRUE;
+			}
+
+			/*
+			 * Convert CR to LF in `buffer`.
+			 * This way more than one line using
+			 * Mac EOLs can be returned at once.
+			 */
+			buffer[i] = '\n';
+			ctx->last_char = '\r';
+			break;
+
+		default:
+			if (ctx->last_char == '\r') {
+				/* the previous CR was a Mac EOL on its own */
+				if (ctx->eol_style < 0)
+					ctx->eol_style = SC_EOL_CR;
+				else if (ctx->eol_style != SC_EOL_CR)
+					ctx->eol_style_inconsistent = TRUE;
+			}
+			/*
+			 * Negative values of `last_char` are reserved
+			 * for the "CRLF was last translated" marker.
+			 * Bytes >= 0x80 must therefore be stored as
+			 * unsigned values - otherwise they would be
+			 * mistaken for the marker on platforms where
+			 * gchar is signed and the next call would skip
+			 * a byte.
+			 * NOTE(review): This assumes that `last_char`
+			 * is declared wider than gchar in eol.h - confirm.
+			 */
+			ctx->last_char = (guchar)buffer[i];
+			break;
+		}
+	}
+
+	/*
+	 * Return remaining block.
+	 * With UNIX/MAC EOLs, this will usually be the
+	 * entire `buffer`
+	 */
+	*data_len = ctx->block_len = ctx->read_len-ctx->offset;
+	*ret = buffer + ctx->offset;
+	return G_IO_STATUS_NORMAL;
+}
+
+/**
+ * Convert everything the data source delivers into one string.
+ *
+ * This calls teco_eol_reader_convert() until it signals
+ * G_IO_STATUS_EOF and concatenates all of the returned chunks.
+ *
+ * @param ctx The EOL Reader object.
+ * @param ret Location to store the newly allocated, null-terminated
+ *            result (allocated via GString).
+ *            Not written in case of errors.
+ * @param out_len Location to store the length of the result in bytes.
+ *                May be NULL.
+ * @param error A GError.
+ * @return G_IO_STATUS_NORMAL on success or G_IO_STATUS_ERROR
+ *         if the conversion failed.
+ *
+ * @memberof teco_eol_reader_t
+ */
+GIOStatus
+teco_eol_reader_convert_all(teco_eol_reader_t *ctx, gchar **ret, gsize *out_len, GError **error)
+{
+	gsize prealloc;
+
+	if (ctx->read_cb == teco_eol_reader_read_gio)
+		prealloc = sizeof(ctx->gio.buffer);
+	else
+		prealloc = ctx->mem.len;
+
+	/*
+	 * NOTE: A GString is used instead of teco_string_t
+	 * since it supports preallocation.
+	 */
+	GString *result = g_string_sized_new(prealloc);
+
+	GIOStatus status;
+	gchar *chunk;
+	gsize chunk_len;
+
+	while ((status = teco_eol_reader_convert(ctx, &chunk, &chunk_len, error)) != G_IO_STATUS_EOF) {
+		if (status == G_IO_STATUS_ERROR) {
+			/* discard everything collected so far */
+			g_string_free(result, TRUE);
+			return G_IO_STATUS_ERROR;
+		}
+
+		g_string_append_len(result, chunk, chunk_len);
+	}
+
+	if (out_len != NULL)
+		*out_len = result->len;
+	/* hands the character data over to the caller */
+	*ret = g_string_free(result, FALSE);
+	return G_IO_STATUS_NORMAL;
+}
+
+/**
+ * Release the resources held by an EOL reader.
+ *
+ * Only readers initialized for GIOChannels hold a resource:
+ * their channel (if any) is unreferenced.
+ *
+ * @param ctx The EOL Reader object.
+ *
+ * @memberof teco_eol_reader_t
+ */
+void
+teco_eol_reader_clear(teco_eol_reader_t *ctx)
+{
+	/* memory readers do not own anything */
+	if (ctx->read_cb != teco_eol_reader_read_gio)
+		return;
+	if (!ctx->gio.channel)
+		return;
+
+	g_io_channel_unref(ctx->gio.channel);
+}
+
+/*
+ * Common part of the writer constructors:
+ * Zero the entire object and cache the EOL sequence
+ * (and its length) that corresponds to eol_mode.
+ */
+static inline void
+teco_eol_writer_init(teco_eol_writer_t *ctx, gint eol_mode)
+{
+	const gchar *seq = teco_eol_get_seq(eol_mode);
+
+	memset(ctx, '\0', sizeof *ctx);
+	ctx->eol_seq = seq;
+	ctx->eol_seq_len = strlen(seq);
+}
+
+/*
+ * Write callback of GIOChannel-backed writers.
+ *
+ * Returns the number of bytes the channel accepted
+ * or -1 if the channel reported an error.
+ * All other channel states are treated alike.
+ */
+static gssize
+teco_eol_writer_write_gio(teco_eol_writer_t *ctx, const gchar *buffer, gsize buffer_len, GError **error)
+{
+	gsize accepted;
+	GIOStatus status = g_io_channel_write_chars(ctx->gio.channel, buffer, buffer_len,
+	                                            &accepted, error);
+
+	if (status == G_IO_STATUS_ERROR)
+		return -1;
+
+	/* G_IO_STATUS_NORMAL, G_IO_STATUS_AGAIN or G_IO_STATUS_EOF */
+	return accepted;
+}
+
+/**
+ * Initialize an EOL writer that writes to a GIOChannel.
+ *
+ * The writer is reset for the given EOL mode, the GIOChannel
+ * write callback is installed and the channel is handed to
+ * teco_eol_writer_set_channel() (not defined in this file).
+ *
+ * @param ctx The EOL Writer object to initialize.
+ * @param eol_mode The Scintilla EOL mode (SC_EOL_*) selecting the
+ *   EOL sequence to write.
+ * @param channel The channel to write to.
+ *   NOTE(review): teco_eol_writer_clear() unreferences the stored
+ *   channel, so teco_eol_writer_set_channel() presumably takes
+ *   its own reference - confirm in eol.h.
+ *
+ * @memberof teco_eol_writer_t
+ */
+void
+teco_eol_writer_init_gio(teco_eol_writer_t *ctx, gint eol_mode, GIOChannel *channel)
+{
+ teco_eol_writer_init(ctx, eol_mode);
+ ctx->write_cb = teco_eol_writer_write_gio;
+ /* the callback is installed before the channel is attached */
+ teco_eol_writer_set_channel(ctx, channel);
+}
+
+/*
+ * Write callback of memory-backed writers:
+ * Append the data to the writer's GString.
+ * The entire buffer is always reported as written.
+ */
+static gssize
+teco_eol_writer_write_mem(teco_eol_writer_t *ctx, const gchar *buffer, gsize buffer_len, GError **error)
+{
+	GString *sink = ctx->mem.str;
+
+	g_string_append_len(sink, buffer, buffer_len);
+	return (gssize)buffer_len;
+}
+
+/**
+ * Initialize an EOL writer that appends to a string in memory.
+ *
+ * @param ctx The EOL Writer object to initialize.
+ * @param eol_mode The Scintilla EOL mode (SC_EOL_*) selecting the
+ *   EOL sequence to write.
+ * @param str The string all output is appended to.
+ *   Only the pointer is stored.
+ *
+ * @note Currently uses GString instead of teco_string_t to allow making use
+ * of preallocation.
+ * On the other hand GString has a higher overhead.
+ *
+ * @memberof teco_eol_writer_t
+ */
+void
+teco_eol_writer_init_mem(teco_eol_writer_t *ctx, gint eol_mode, GString *str)
+{
+	teco_eol_writer_init(ctx, eol_mode);
+
+	ctx->mem.str = str;
+	ctx->write_cb = teco_eol_writer_write_mem;
+}
+
+/**
+ * Perform EOL-normalization on a buffer (if enabled) and
+ * pass it to the underlying data sink.
+ *
+ * This can be called repeatedly to transform a larger
+ * document - the buffer provided does not have to be
+ * well-formed with regard to EOL sequences.
+ *
+ * If less than buffer_len bytes are reported as consumed,
+ * the data sink did not accept all of the data.
+ * The function is written so that the next call can continue
+ * with the first unconsumed byte of the buffer.
+ *
+ * @param ctx The EOL Writer object.
+ * @param buffer The buffer to convert.
+ * @param buffer_len The length of the data in buffer.
+ * @param error A GError.
+ * @return The number of bytes consumed/converted from buffer.
+ *         A value smaller than 0 is returned in case of errors.
+ *
+ * @memberof teco_eol_writer_t
+ */
+gssize
+teco_eol_writer_convert(teco_eol_writer_t *ctx, const gchar *buffer, gsize buffer_len, GError **error)
+{
+	if (!(teco_ed & TECO_ED_AUTOEOL))
+		/*
+		 * Write without EOL-translation:
+		 * `state` is not required.
+		 * NOTE: Errors are reported by the callback's
+		 * negative return value.
+		 */
+		return ctx->write_cb(ctx, buffer, buffer_len, error);
+
+	/*
+	 * Write to stream with EOL-translation.
+	 * The document's EOL mode tells us what was guessed
+	 * when its content was read in (presumably from a file)
+	 * but might have been changed manually by the user.
+	 * NOTE: This code assumes that the output stream is
+	 * buffered, since otherwise it would be slower
+	 * (has been benchmarked).
+	 * NOTE: The loop is executed for every character
+	 * in `buffer` and has been optimized for minimal
+	 * function (i.e. GIOChannel) calls.
+	 * NOTE: Indexes are of type gsize like `buffer_len`,
+	 * so they cannot wrap around on large buffers.
+	 */
+	gsize i = 0;
+	gsize bytes_written = 0;
+	if (ctx->state == TECO_EOL_STATE_WRITE_LF) {
+		/*
+		 * Complete writing a CRLF sequence.
+		 * The EOL character in `buffer` that triggered it
+		 * has not yet been reported as consumed, so it
+		 * is still the first byte of `buffer`.
+		 */
+		if (buffer_len == 0)
+			/*
+			 * Nothing to consume - keep the state.
+			 * Skipping a byte of an empty buffer would
+			 * underflow the length of the remaining block.
+			 */
+			return 0;
+
+		gssize rc = ctx->write_cb(ctx, "\n", 1, error);
+		if (rc < 1)
+			/* nothing written or error */
+			return rc;
+		ctx->state = TECO_EOL_STATE_START;
+		/*
+		 * Remember the EOL character skipped below.
+		 * If it is the CR of a CRLF in `buffer`,
+		 * the following LF must not produce another
+		 * EOL sequence.
+		 */
+		ctx->last_c = buffer[0];
+		bytes_written++;
+		i++;
+	}
+
+	/* start of the pending block, which is free of EOL characters */
+	gsize block_start = i;
+	gssize block_written;
+	while (i < buffer_len) {
+		switch (buffer[i]) {
+		case '\n':
+			if (ctx->last_c == '\r') {
+				/* EOL sequence already written */
+				bytes_written++;
+				block_start = i+1;
+				break;
+			}
+			/* fall through */
+		case '\r':
+			/* write out everything before the EOL character */
+			block_written = ctx->write_cb(ctx, buffer+block_start, i-block_start, error);
+			if (block_written < 0)
+				return -1;
+			bytes_written += block_written;
+			if ((gsize)block_written < i-block_start)
+				/* block written only partially */
+				return bytes_written;
+
+			block_written = ctx->write_cb(ctx, ctx->eol_seq, ctx->eol_seq_len, error);
+			if (block_written < 0)
+				return -1;
+			if (block_written == 0)
+				return bytes_written;
+			if (block_written < ctx->eol_seq_len) {
+				/* incomplete EOL seq - we have written CR of CRLF */
+				ctx->state = TECO_EOL_STATE_WRITE_LF;
+				return bytes_written;
+			}
+			/* the EOL character in `buffer` has been consumed */
+			bytes_written++;
+
+			block_start = i+1;
+			break;
+		}
+
+		ctx->last_c = buffer[i++];
+	}
+
+	/*
+	 * Write out remaining block (i.e. line)
+	 */
+	gssize rc = ctx->write_cb(ctx, buffer+block_start, buffer_len-block_start, error);
+	return rc < 0 ? -1 : bytes_written + rc;
+}
+
+/**
+ * Release the resources held by an EOL writer.
+ *
+ * Only writers initialized for GIOChannels hold a resource:
+ * their channel (if any) is unreferenced.
+ * The GString of memory writers is left untouched.
+ *
+ * @param ctx The EOL Writer object.
+ *
+ * @memberof teco_eol_writer_t
+ */
+void
+teco_eol_writer_clear(teco_eol_writer_t *ctx)
+{
+	/* memory writers do not own anything */
+	if (ctx->write_cb != teco_eol_writer_write_gio)
+		return;
+	if (!ctx->gio.channel)
+		return;
+
+	g_io_channel_unref(ctx->gio.channel);
+}