/* * Copyright (C) 2012-2024 Robin Haberkorn * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #ifdef HAVE_CONFIG_H #include "config.h" #endif #include #include #include #include "sciteco.h" #include "eol.h" const gchar * teco_eol_get_seq(gint eol_mode) { switch (eol_mode) { case SC_EOL_CRLF: return "\r\n"; case SC_EOL_CR: return "\r"; case SC_EOL_LF: default: return "\n"; } } static inline void teco_eol_reader_init(teco_eol_reader_t *ctx) { memset(ctx, 0, sizeof(*ctx)); ctx->eol_style = -1; } static GIOStatus teco_eol_reader_read_gio(teco_eol_reader_t *ctx, gsize *read_len, GError **error) { return g_io_channel_read_chars(ctx->gio.channel, ctx->gio.buffer, sizeof(ctx->gio.buffer), read_len, error); } /** @memberof teco_eol_reader_t */ void teco_eol_reader_init_gio(teco_eol_reader_t *ctx, GIOChannel *channel) { teco_eol_reader_init(ctx); ctx->read_cb = teco_eol_reader_read_gio; teco_eol_reader_set_channel(ctx, channel); } static GIOStatus teco_eol_reader_read_mem(teco_eol_reader_t *ctx, gsize *read_len, GError **error) { *read_len = ctx->mem.len; ctx->mem.len = 0; /* * On the first call, returns G_IO_STATUS_NORMAL, * later G_IO_STATUS_EOF. */ return *read_len != 0 ? G_IO_STATUS_NORMAL : G_IO_STATUS_EOF; } /** @memberof teco_eol_reader_t */ void teco_eol_reader_init_mem(teco_eol_reader_t *ctx, gchar *buffer, gsize len) { teco_eol_reader_init(ctx); ctx->read_cb = teco_eol_reader_read_mem; ctx->mem.buffer = buffer; ctx->mem.len = len; } /** * Read data with automatic EOL translation. * * This gets the next data block from the converter * implementation, performs EOL translation (if enabled) * in a more or less efficient manner and returns * a chunk of EOL-normalized data. * * Since the underlying data source may have to be * queried repeatedly and because the EOL Reader avoids * reassembling the EOL-normalized data by returning * references into the modified data source, it is * necessary to call this function repeatedly until * it returns G_IO_STATUS_EOF. * * @param ctx The EOL Reader object. * @param ret Location to store a pointer to the converted chunk. * The EOL-converted data is NOT null-terminated. * @param data_len A pointer to the length of the converted chunk. * @param error A GError. * @return The status of the conversion. * * @memberof teco_eol_reader_t */ GIOStatus teco_eol_reader_convert(teco_eol_reader_t *ctx, gchar **ret, gsize *data_len, GError **error) { gchar *buffer = ctx->read_cb == teco_eol_reader_read_gio ? ctx->gio.buffer : ctx->mem.buffer; if (ctx->last_char < 0) { /* a CRLF was last translated */ ctx->block_len++; ctx->last_char = '\n'; } ctx->offset += ctx->block_len; if (ctx->offset >= ctx->read_len) { ctx->offset = 0; switch (ctx->read_cb(ctx, &ctx->read_len, error)) { case G_IO_STATUS_ERROR: return G_IO_STATUS_ERROR; case G_IO_STATUS_EOF: if (ctx->last_char == '\r') { /* * Very last character read is CR. * If this is the only EOL so far, the * EOL style is MAC. * This is also executed if auto-eol is disabled * but it doesn't hurt. */ if (ctx->eol_style < 0) ctx->eol_style = SC_EOL_CR; else if (ctx->eol_style != SC_EOL_CR) ctx->eol_style_inconsistent = TRUE; } return G_IO_STATUS_EOF; case G_IO_STATUS_NORMAL: case G_IO_STATUS_AGAIN: break; } if (!(teco_ed & TECO_ED_AUTOEOL)) { /* * No EOL translation - always return entire * buffer */ *data_len = ctx->block_len = ctx->read_len; *ret = buffer; return G_IO_STATUS_NORMAL; } } /* * Return data with automatic EOL translation. * Every EOL sequence is normalized to LF and * the first sequence determines the documents * EOL style. * This loop is executed for every byte of the * file/stream, so it was important to optimize * it. Specifically, the number of returns * is minimized by keeping a pointer to * the beginning of a block of data in the buffer * which already has LFs (offset). * Mac EOLs can be converted to UNIX EOLs directly * in the buffer. * So if their EOLs are consistent, the function * will return one block for the entire buffer. * When reading a file with DOS EOLs, there will * be one call per line which is significantly slower. */ for (guint i = ctx->offset; i < ctx->read_len; i++) { switch (buffer[i]) { case '\n': if (ctx->last_char == '\r') { if (ctx->eol_style < 0) ctx->eol_style = SC_EOL_CRLF; else if (ctx->eol_style != SC_EOL_CRLF) ctx->eol_style_inconsistent = TRUE; /* * Return block. CR has already * been made LF in `buffer`. */ *data_len = ctx->block_len = i-ctx->offset; /* next call will skip the CR */ ctx->last_char = -1; *ret = buffer + ctx->offset; return G_IO_STATUS_NORMAL; } if (ctx->eol_style < 0) ctx->eol_style = SC_EOL_LF; else if (ctx->eol_style != SC_EOL_LF) ctx->eol_style_inconsistent = TRUE; /* * No conversion necessary and no need to * return block yet. */ ctx->last_char = '\n'; break; case '\r': if (ctx->last_char == '\r') { if (ctx->eol_style < 0) ctx->eol_style = SC_EOL_CR; else if (ctx->eol_style != SC_EOL_CR) ctx->eol_style_inconsistent = TRUE; } /* * Convert CR to LF in `buffer`. * This way more than one line using * Mac EOLs can be returned at once. */ buffer[i] = '\n'; ctx->last_char = '\r'; break; default: if (ctx->last_char == '\r') { if (ctx->eol_style < 0) ctx->eol_style = SC_EOL_CR; else if (ctx->eol_style != SC_EOL_CR) ctx->eol_style_inconsistent = TRUE; } ctx->last_char = (guchar)buffer[i]; break; } } /* * Return remaining block. * With UNIX/MAC EOLs, this will usually be the * entire `buffer` */ *data_len = ctx->block_len = ctx->read_len-ctx->offset; *ret = buffer + ctx->offset; return G_IO_STATUS_NORMAL; } /** @memberof teco_eol_reader_t */ GIOStatus teco_eol_reader_convert_all(teco_eol_reader_t *ctx, gchar **ret, gsize *out_len, GError **error) { gsize buffer_len = ctx->read_cb == teco_eol_reader_read_gio ? sizeof(ctx->gio.buffer) : ctx->mem.len; /* * NOTE: Doesn't use teco_string_t to make use of GString's * preallocation feature. */ GString *str = g_string_sized_new(buffer_len); for (;;) { gchar *data; gsize data_len; GIOStatus rc = teco_eol_reader_convert(ctx, &data, &data_len, error); if (rc == G_IO_STATUS_ERROR) { g_string_free(str, TRUE); return G_IO_STATUS_ERROR; } if (rc == G_IO_STATUS_EOF) break; g_string_append_len(str, data, data_len); } if (out_len) *out_len = str->len; *ret = g_string_free(str, FALSE); return G_IO_STATUS_NORMAL; } /** @memberof teco_eol_reader_t */ void teco_eol_reader_clear(teco_eol_reader_t *ctx) { if (ctx->read_cb == teco_eol_reader_read_gio && ctx->gio.channel) g_io_channel_unref(ctx->gio.channel); } static inline void teco_eol_writer_init(teco_eol_writer_t *ctx, gint eol_mode) { memset(ctx, 0, sizeof(*ctx)); ctx->eol_seq = teco_eol_get_seq(eol_mode); ctx->eol_seq_len = strlen(ctx->eol_seq); } static gssize teco_eol_writer_write_gio(teco_eol_writer_t *ctx, const gchar *buffer, gsize buffer_len, GError **error) { gsize bytes_written; switch (g_io_channel_write_chars(ctx->gio.channel, buffer, buffer_len, &bytes_written, error)) { case G_IO_STATUS_ERROR: return -1; case G_IO_STATUS_EOF: case G_IO_STATUS_NORMAL: case G_IO_STATUS_AGAIN: break; } return bytes_written; } /** @memberof teco_eol_writer_t */ void teco_eol_writer_init_gio(teco_eol_writer_t *ctx, gint eol_mode, GIOChannel *channel) { teco_eol_writer_init(ctx, eol_mode); ctx->write_cb = teco_eol_writer_write_gio; teco_eol_writer_set_channel(ctx, channel); } static gssize teco_eol_writer_write_mem(teco_eol_writer_t *ctx, const gchar *buffer, gsize buffer_len, GError **error) { g_string_append_len(ctx->mem.str, buffer, buffer_len); return buffer_len; } /** * @note Currently uses GString instead of teco_string_t to allow making use * of preallocation. * On the other hand GString has a higher overhead. * * @memberof teco_eol_writer_t */ void teco_eol_writer_init_mem(teco_eol_writer_t *ctx, gint eol_mode, GString *str) { teco_eol_writer_init(ctx, eol_mode); ctx->write_cb = teco_eol_writer_write_mem; ctx->mem.str = str; } /** * Perform EOL-normalization on a buffer (if enabled) and * pass it to the underlying data sink. * * This can be called repeatedly to transform a larger * document - the buffer provided does not have to be * well-formed with regard to EOL sequences. * * @param ctx The EOL Reader object. * @param buffer The buffer to convert. * @param buffer_len The length of the data in buffer. * @param error A GError. * @return The number of bytes consumed/converted from buffer. * A value smaller than 0 is returned in case of errors. * * @memberof teco_eol_writer_t */ gssize teco_eol_writer_convert(teco_eol_writer_t *ctx, const gchar *buffer, gsize buffer_len, GError **error) { if (!(teco_ed & TECO_ED_AUTOEOL)) /* * Write without EOL-translation: * `state` is not required * NOTE: This throws in case of errors */ return ctx->write_cb(ctx, buffer, buffer_len, error); /* * Write to stream with EOL-translation. * The document's EOL mode tells us what was guessed * when its content was read in (presumably from a file) * but might have been changed manually by the user. * NOTE: This code assumes that the output stream is * buffered, since otherwise it would be slower * (has been benchmarked). * NOTE: The loop is executed for every character * in `buffer` and has been optimized for minimal * function (i.e. GIOChannel) calls. */ guint i = 0; gsize bytes_written = 0; if (ctx->state == TECO_EOL_STATE_WRITE_LF) { /* complete writing a CRLF sequence */ gssize rc = ctx->write_cb(ctx, "\n", 1, error); if (rc < 1) /* nothing written or error */ return rc; ctx->state = TECO_EOL_STATE_START; bytes_written++; i++; } guint block_start = i; gssize block_written; while (i < buffer_len) { switch (buffer[i]) { case '\n': if (ctx->last_c == '\r') { /* EOL sequence already written */ bytes_written++; block_start = i+1; break; } /* fall through */ case '\r': block_written = ctx->write_cb(ctx, buffer+block_start, i-block_start, error); if (block_written < 0) return -1; bytes_written += block_written; if (block_written < i-block_start) return bytes_written; block_written = ctx->write_cb(ctx, ctx->eol_seq, ctx->eol_seq_len, error); if (block_written < 0) return -1; if (block_written == 0) return bytes_written; if (block_written < ctx->eol_seq_len) { /* incomplete EOL seq - we have written CR of CRLF */ ctx->state = TECO_EOL_STATE_WRITE_LF; return bytes_written; } bytes_written++; block_start = i+1; break; } ctx->last_c = buffer[i++]; } /* * Write out remaining block (i.e. line) */ gssize rc = ctx->write_cb(ctx, buffer+block_start, buffer_len-block_start, error); return rc < 0 ? -1 : bytes_written + rc; } /** @memberof teco_eol_writer_t */ void teco_eol_writer_clear(teco_eol_writer_t *ctx) { if (ctx->write_cb == teco_eol_writer_write_gio && ctx->gio.channel) g_io_channel_unref(ctx->gio.channel); }