diff options
Diffstat (limited to 'src/eol.c')
-rw-r--r-- | src/eol.c | 461 |
1 files changed, 461 insertions, 0 deletions
diff --git a/src/eol.c b/src/eol.c new file mode 100644 index 0000000..44ad021 --- /dev/null +++ b/src/eol.c @@ -0,0 +1,461 @@ +/* + * Copyright (C) 2012-2021 Robin Haberkorn + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include <string.h> + +#include <glib.h> + +#include <Scintilla.h> + +#include "sciteco.h" +#include "eol.h" + +const gchar * +teco_eol_get_seq(gint eol_mode) +{ + switch (eol_mode) { + case SC_EOL_CRLF: + return "\r\n"; + case SC_EOL_CR: + return "\r"; + case SC_EOL_LF: + default: + return "\n"; + } +} + +static inline void +teco_eol_reader_init(teco_eol_reader_t *ctx) +{ + memset(ctx, 0, sizeof(*ctx)); + ctx->eol_style = -1; +} + +static GIOStatus +teco_eol_reader_read_gio(teco_eol_reader_t *ctx, gsize *read_len, GError **error) +{ + return g_io_channel_read_chars(ctx->gio.channel, ctx->gio.buffer, + sizeof(ctx->gio.buffer), + read_len, error); +} + +/** @memberof teco_eol_reader_t */ +void +teco_eol_reader_init_gio(teco_eol_reader_t *ctx, GIOChannel *channel) +{ + teco_eol_reader_init(ctx); + ctx->read_cb = teco_eol_reader_read_gio; + + teco_eol_reader_set_channel(ctx, channel); +} + +static GIOStatus +teco_eol_reader_read_mem(teco_eol_reader_t *ctx, gsize *read_len, GError **error) +{ + *read_len = ctx->mem.len; + ctx->mem.len = 0; + /* + * On the first call, returns G_IO_STATUS_NORMAL, + * later G_IO_STATUS_EOF. + */ + return *read_len != 0 ? G_IO_STATUS_NORMAL : G_IO_STATUS_EOF; +} + +/** @memberof teco_eol_reader_t */ +void +teco_eol_reader_init_mem(teco_eol_reader_t *ctx, gchar *buffer, gsize len) +{ + teco_eol_reader_init(ctx); + ctx->read_cb = teco_eol_reader_read_mem; + + ctx->mem.buffer = buffer; + ctx->mem.len = len; +} + +/** + * Read data with automatic EOL translation. + * + * This gets the next data block from the converter + * implementation, performs EOL translation (if enabled) + * in a more or less efficient manner and returns + * a chunk of EOL-normalized data. + * + * Since the underlying data source may have to be + * queried repeatedly and because the EOL Reader avoids + * reassembling the EOL-normalized data by returning + * references into the modified data source, it is + * necessary to call this function repeatedly until + * it returns G_IO_STATUS_EOF. + * + * @param ctx The EOL Reader object. + * @param ret Location to store a pointer to the converted chunk. + * The EOL-converted data is NOT null-terminated. + * @param data_len A pointer to the length of the converted chunk. + * @param error A GError. + * @return The status of the conversion. + * + * @memberof teco_eol_reader_t + */ +GIOStatus +teco_eol_reader_convert(teco_eol_reader_t *ctx, gchar **ret, gsize *data_len, GError **error) +{ + gchar *buffer = ctx->read_cb == teco_eol_reader_read_gio ? ctx->gio.buffer : ctx->mem.buffer; + + if (ctx->last_char < 0) { + /* a CRLF was last translated */ + ctx->block_len++; + ctx->last_char = '\n'; + } + ctx->offset += ctx->block_len; + + if (ctx->offset == ctx->read_len) { + ctx->offset = 0; + + switch (ctx->read_cb(ctx, &ctx->read_len, error)) { + case G_IO_STATUS_ERROR: + return G_IO_STATUS_ERROR; + + case G_IO_STATUS_EOF: + if (ctx->last_char == '\r') { + /* + * Very last character read is CR. + * If this is the only EOL so far, the + * EOL style is MAC. + * This is also executed if auto-eol is disabled + * but it doesn't hurt. + */ + if (ctx->eol_style < 0) + ctx->eol_style = SC_EOL_CR; + else if (ctx->eol_style != SC_EOL_CR) + ctx->eol_style_inconsistent = TRUE; + } + + return G_IO_STATUS_EOF; + + case G_IO_STATUS_NORMAL: + case G_IO_STATUS_AGAIN: + break; + } + + if (!(teco_ed & TECO_ED_AUTOEOL)) { + /* + * No EOL translation - always return entire + * buffer + */ + *data_len = ctx->block_len = ctx->read_len; + *ret = buffer; + return G_IO_STATUS_NORMAL; + } + } + + /* + * Return data with automatic EOL translation. + * Every EOL sequence is normalized to LF and + * the first sequence determines the documents + * EOL style. + * This loop is executed for every byte of the + * file/stream, so it was important to optimize + * it. Specifically, the number of returns + * is minimized by keeping a pointer to + * the beginning of a block of data in the buffer + * which already has LFs (offset). + * Mac EOLs can be converted to UNIX EOLs directly + * in the buffer. + * So if their EOLs are consistent, the function + * will return one block for the entire buffer. + * When reading a file with DOS EOLs, there will + * be one call per line which is significantly slower. + */ + for (guint i = ctx->offset; i < ctx->read_len; i++) { + switch (buffer[i]) { + case '\n': + if (ctx->last_char == '\r') { + if (ctx->eol_style < 0) + ctx->eol_style = SC_EOL_CRLF; + else if (ctx->eol_style != SC_EOL_CRLF) + ctx->eol_style_inconsistent = TRUE; + + /* + * Return block. CR has already + * been made LF in `buffer`. + */ + *data_len = ctx->block_len = i-ctx->offset; + /* next call will skip the CR */ + ctx->last_char = -1; + *ret = buffer + ctx->offset; + return G_IO_STATUS_NORMAL; + } + + if (ctx->eol_style < 0) + ctx->eol_style = SC_EOL_LF; + else if (ctx->eol_style != SC_EOL_LF) + ctx->eol_style_inconsistent = TRUE; + /* + * No conversion necessary and no need to + * return block yet. + */ + ctx->last_char = '\n'; + break; + + case '\r': + if (ctx->last_char == '\r') { + if (ctx->eol_style < 0) + ctx->eol_style = SC_EOL_CR; + else if (ctx->eol_style != SC_EOL_CR) + ctx->eol_style_inconsistent = TRUE; + } + + /* + * Convert CR to LF in `buffer`. + * This way more than one line using + * Mac EOLs can be returned at once. + */ + buffer[i] = '\n'; + ctx->last_char = '\r'; + break; + + default: + if (ctx->last_char == '\r') { + if (ctx->eol_style < 0) + ctx->eol_style = SC_EOL_CR; + else if (ctx->eol_style != SC_EOL_CR) + ctx->eol_style_inconsistent = TRUE; + } + ctx->last_char = buffer[i]; + break; + } + } + + /* + * Return remaining block. + * With UNIX/MAC EOLs, this will usually be the + * entire `buffer` + */ + *data_len = ctx->block_len = ctx->read_len-ctx->offset; + *ret = buffer + ctx->offset; + return G_IO_STATUS_NORMAL; +} + +/** @memberof teco_eol_reader_t */ +GIOStatus +teco_eol_reader_convert_all(teco_eol_reader_t *ctx, gchar **ret, gsize *out_len, GError **error) +{ + gsize buffer_len = ctx->read_cb == teco_eol_reader_read_gio + ? sizeof(ctx->gio.buffer) : ctx->mem.len; + + /* + * NOTE: Doesn't use teco_string_t to make use of GString's + * preallocation feature. + */ + GString *str = g_string_sized_new(buffer_len); + + for (;;) { + gchar *data; + gsize data_len; + + GIOStatus rc = teco_eol_reader_convert(ctx, &data, &data_len, error); + if (rc == G_IO_STATUS_ERROR) { + g_string_free(str, TRUE); + return G_IO_STATUS_ERROR; + } + if (rc == G_IO_STATUS_EOF) + break; + + g_string_append_len(str, data, data_len); + } + + if (out_len) + *out_len = str->len; + *ret = g_string_free(str, FALSE); + return G_IO_STATUS_NORMAL; +} + +/** @memberof teco_eol_reader_t */ +void +teco_eol_reader_clear(teco_eol_reader_t *ctx) +{ + if (ctx->read_cb == teco_eol_reader_read_gio && ctx->gio.channel) + g_io_channel_unref(ctx->gio.channel); +} + +static inline void +teco_eol_writer_init(teco_eol_writer_t *ctx, gint eol_mode) +{ + memset(ctx, 0, sizeof(*ctx)); + ctx->eol_seq = teco_eol_get_seq(eol_mode); + ctx->eol_seq_len = strlen(ctx->eol_seq); +} + +static gssize +teco_eol_writer_write_gio(teco_eol_writer_t *ctx, const gchar *buffer, gsize buffer_len, GError **error) +{ + gsize bytes_written; + + switch (g_io_channel_write_chars(ctx->gio.channel, buffer, buffer_len, + &bytes_written, error)) { + case G_IO_STATUS_ERROR: + return -1; + case G_IO_STATUS_EOF: + case G_IO_STATUS_NORMAL: + case G_IO_STATUS_AGAIN: + break; + } + + return bytes_written; +} + +/** @memberof teco_eol_writer_t */ +void +teco_eol_writer_init_gio(teco_eol_writer_t *ctx, gint eol_mode, GIOChannel *channel) +{ + teco_eol_writer_init(ctx, eol_mode); + ctx->write_cb = teco_eol_writer_write_gio; + teco_eol_writer_set_channel(ctx, channel); +} + +static gssize +teco_eol_writer_write_mem(teco_eol_writer_t *ctx, const gchar *buffer, gsize buffer_len, GError **error) +{ + g_string_append_len(ctx->mem.str, buffer, buffer_len); + return buffer_len; +} + +/** + * @note Currently uses GString instead of teco_string_t to allow making use + * of preallocation. + * On the other hand GString has a higher overhead. + * + * @memberof teco_eol_writer_t + */ +void +teco_eol_writer_init_mem(teco_eol_writer_t *ctx, gint eol_mode, GString *str) +{ + teco_eol_writer_init(ctx, eol_mode); + ctx->write_cb = teco_eol_writer_write_mem; + ctx->mem.str = str; +} + +/** + * Perform EOL-normalization on a buffer (if enabled) and + * pass it to the underlying data sink. + * + * This can be called repeatedly to transform a larger + * document - the buffer provided does not have to be + * well-formed with regard to EOL sequences. + * + * @param ctx The EOL Reader object. + * @param buffer The buffer to convert. + * @param buffer_len The length of the data in buffer. + * @param error A GError. + * @return The number of bytes consumed/converted from buffer. + * A value smaller than 0 is returned in case of errors. + * + * @memberof teco_eol_writer_t + */ +gssize +teco_eol_writer_convert(teco_eol_writer_t *ctx, const gchar *buffer, gsize buffer_len, GError **error) +{ + if (!(teco_ed & TECO_ED_AUTOEOL)) + /* + * Write without EOL-translation: + * `state` is not required + * NOTE: This throws in case of errors + */ + return ctx->write_cb(ctx, buffer, buffer_len, error); + + /* + * Write to stream with EOL-translation. + * The document's EOL mode tells us what was guessed + * when its content was read in (presumably from a file) + * but might have been changed manually by the user. + * NOTE: This code assumes that the output stream is + * buffered, since otherwise it would be slower + * (has been benchmarked). + * NOTE: The loop is executed for every character + * in `buffer` and has been optimized for minimal + * function (i.e. GIOChannel) calls. + */ + guint i = 0; + gsize bytes_written = 0; + if (ctx->state == TECO_EOL_STATE_WRITE_LF) { + /* complete writing a CRLF sequence */ + gssize rc = ctx->write_cb(ctx, "\n", 1, error); + if (rc < 1) + /* nothing written or error */ + return rc; + ctx->state = TECO_EOL_STATE_START; + bytes_written++; + i++; + } + + guint block_start = i; + gssize block_written; + while (i < buffer_len) { + switch (buffer[i]) { + case '\n': + if (ctx->last_c == '\r') { + /* EOL sequence already written */ + bytes_written++; + block_start = i+1; + break; + } + /* fall through */ + case '\r': + block_written = ctx->write_cb(ctx, buffer+block_start, i-block_start, error); + if (block_written < 0) + return -1; + bytes_written += block_written; + if (block_written < i-block_start) + return bytes_written; + + block_written = ctx->write_cb(ctx, ctx->eol_seq, ctx->eol_seq_len, error); + if (block_written < 0) + return -1; + if (block_written == 0) + return bytes_written; + if (block_written < ctx->eol_seq_len) { + /* incomplete EOL seq - we have written CR of CRLF */ + ctx->state = TECO_EOL_STATE_WRITE_LF; + return bytes_written; + } + bytes_written++; + + block_start = i+1; + break; + } + + ctx->last_c = buffer[i++]; + } + + /* + * Write out remaining block (i.e. line) + */ + gssize rc = ctx->write_cb(ctx, buffer+block_start, buffer_len-block_start, error); + return rc < 0 ? -1 : bytes_written + rc; +} + +/** @memberof teco_eol_writer_t */ +void +teco_eol_writer_clear(teco_eol_writer_t *ctx) +{ + if (ctx->write_cb == teco_eol_writer_write_gio && ctx->gio.channel) + g_io_channel_unref(ctx->gio.channel); +} |