1 files changed, 360 insertions, 0 deletions
diff --git a/src/eol.cpp b/src/eol.cpp
new file mode 100644
index 0000000..3b42547
--- /dev/null
+++ b/src/eol.cpp
@@ -0,0 +1,360 @@
+/*
+ * Copyright (C) 2012-2016 Robin Haberkorn
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <glib.h>
+
+#include "sciteco.h"
+#include "error.h"
+#include "eol.h"
+
+namespace SciTECO {
+
+/**
+ * Read data with automatic EOL translation.
+ *
+ * This gets the next data block from the converter
+ * implementation, performs EOL translation (if enabled)
+ * in a more or less efficient manner and returns
+ * a chunk of EOL-normalized data.
+ *
+ * Since the underlying data source may have to be
+ * queried repeatedly and because EOLReader avoids
+ * reassembling the EOL-normalized data by returning
+ * references into the modified data source, it is
+ * necessary to call this function repeatedly until
+ * it returns NULL.
+ *
+ * Errors reading the data source are propagated
+ * (as exceptions).
+ *
+ * @param data_len The length of the data chunk returned
+ *                 by this function. Set on return.
+ * @return A pointer to a chunk of EOL-normalized
+ *         data of length data_len.
+ *         It is NOT null-terminated.
+ *         NULL is returned when all data has been converted.
+ */
+const gchar *
+EOLReader::convert(gsize &data_len)
+{
+	if (last_char < 0) {
+		/* a CRLF was last translated */
+		block_len++;
+		last_char = '\n';
+	}
+	offset += block_len;
+
+	if (offset == read_len) {
+		offset = 0;
+
+		/*
+		 * NOTE: This throws in case of errors
+		 */
+		if (!this->read(buffer, read_len)) {
+			/* EOF */
+			if (last_char == '\r') {
+				/*
+				 * Very last character read is CR.
+				 * If this is the only EOL so far, the
+				 * EOL style is MAC.
+				 * This is also executed if auto-eol is disabled
+				 * but it doesn't hurt.
+				 */
+				if (eol_style < 0)
+					eol_style = SC_EOL_CR;
+				else if (eol_style != SC_EOL_CR)
+					eol_style_inconsistent = TRUE;
+			}
+
+			return NULL;
+		}
+
+		if (!(Flags::ed & Flags::ED_AUTOEOL)) {
+			/*
+			 * No EOL translation - always return entire
+			 * buffer
+			 */
+			data_len = block_len = read_len;
+			return buffer;
+		}
+	}
+
+	/*
+	 * Return data with automatic EOL translation.
+	 * Every EOL sequence is normalized to LF and
+	 * the first sequence determines the documents
+	 * EOL style.
+	 * This loop is executed for every byte of the
+	 * file/stream, so it was important to optimize
+	 * it. Specifically, the number of returns
+	 * is minimized by keeping a pointer to
+	 * the beginning of a block of data in the buffer
+	 * which already has LFs (offset).
+	 * Mac EOLs can be converted to UNIX EOLs directly
+	 * in the buffer.
+	 * So if their EOLs are consistent, the function
+	 * will return one block for the entire buffer.
+	 * When reading a file with DOS EOLs, there will
+	 * be one call per line which is significantly slower.
+	 */
+	for (guint i = offset; i < read_len; i++) {
+		switch (buffer[i]) {
+		case '\n':
+			if (last_char == '\r') {
+				if (eol_style < 0)
+					eol_style = SC_EOL_CRLF;
+				else if (eol_style != SC_EOL_CRLF)
+					eol_style_inconsistent = TRUE;
+
+				/*
+				 * Return block. CR has already
+				 * been made LF in `buffer`.
+				 */
+				data_len = block_len = i-offset;
+				/* next call will skip the CR */
+				last_char = -1;
+				return buffer + offset;
+			}
+
+			if (eol_style < 0)
+				eol_style = SC_EOL_LF;
+			else if (eol_style != SC_EOL_LF)
+				eol_style_inconsistent = TRUE;
+			/*
+			 * No conversion necessary and no need to
+			 * return block yet.
+			 */
+			last_char = '\n';
+			break;
+
+		case '\r':
+			if (last_char == '\r') {
+				if (eol_style < 0)
+					eol_style = SC_EOL_CR;
+				else if (eol_style != SC_EOL_CR)
+					eol_style_inconsistent = TRUE;
+			}
+
+			/*
+			 * Convert CR to LF in `buffer`.
+			 * This way more than one line using
+			 * Mac EOLs can be returned at once.
+			 */
+			buffer[i] = '\n';
+			last_char = '\r';
+			break;
+
+		default:
+			if (last_char == '\r') {
+				if (eol_style < 0)
+					eol_style = SC_EOL_CR;
+				else if (eol_style != SC_EOL_CR)
+					eol_style_inconsistent = TRUE;
+			}
+			last_char = buffer[i];
+			break;
+		}
+	}
+
+	/*
+	 * Return remaining block.
+	 * With UNIX/MAC EOLs, this will usually be the
+	 * entire `buffer`
+	 */
+	data_len = block_len = read_len-offset;
+	return buffer + offset;
+}
+
+bool
+EOLReaderGIO::read(gchar *buffer, gsize &read_len)
+{
+	GError *error = NULL;
+
+	switch (g_io_channel_read_chars(channel, buffer,
+	                                sizeof(EOLReaderGIO::buffer),
+	                                &read_len, &error)) {
+	case G_IO_STATUS_ERROR:
+		throw GlibError(error);
+	case G_IO_STATUS_EOF:
+		return false;
+	case G_IO_STATUS_NORMAL:
+	case G_IO_STATUS_AGAIN:
+		break;
+	}
+
+	return true;
+}
+
+bool
+EOLReaderMem::read(gchar *buffer, gsize &read_len)
+{
+	read_len = buffer_len;
+	buffer_len = 0;
+	/*
+	 * On the first call, returns true,
+	 * later false (no more data).
+	 */
+	return read_len != 0;
+}
+
+/*
+ * This could be in EOLReader as well, but this way, we
+ * make use of the buffer_len to avoid unnecessary allocations.
+ */
+gchar *
+EOLReaderMem::convert_all(gsize *out_len)
+{
+	GString *str = g_string_sized_new(buffer_len);
+	const gchar *data;
+	gsize data_len;
+
+	try {
+		while ((data = convert(data_len)))
+			g_string_append_len(str, data, data_len);
+	} catch (...) {
+		g_string_free(str, TRUE);
+		throw; /* forward */
+	}
+
+	if (out_len)
+		*out_len = str->len;
+	return g_string_free(str, FALSE);
+}
+
+/**
+ * Perform EOL-normalization on a buffer (if enabled) and
+ * pass it to the underlying data sink.
+ *
+ * This can be called repeatedly to transform a larger
+ * document - the buffer provided does not have to be
+ * well-formed with regard to EOL sequences.
+ *
+ * @param buffer The buffer to convert.
+ * @parem buffer_len The length of the data in buffer.
+ * @return The number of bytes written to the data sink,
+ *         i.e. the size of the EOL-normalized data written.
+ */
+gsize
+EOLWriter::convert(const gchar *buffer, gsize buffer_len)
+{
+	gsize bytes_written;
+	guint i = 0;
+	guint block_start;
+	gsize block_written;
+
+	if (!(Flags::ed & Flags::ED_AUTOEOL))
+		/*
+		 * Write without EOL-translation:
+		 * `state` is not required
+		 * NOTE: This throws in case of errors
+		 */
+		return this->write(buffer, buffer_len);
+
+	/*
+	 * Write to stream with EOL-translation.
+	 * The document's EOL mode tells us what was guessed
+	 * when its content was read in (presumably from a file)
+	 * but might have been changed manually by the user.
+	 * NOTE: This code assumes that the output stream is
+	 * buffered, since otherwise it would be slower
+	 * (has been benchmarked).
+	 * NOTE: The loop is executed for every character
+	 * in `buffer` and has been optimized for minimal
+	 * function (i.e. GIOChannel) calls.
+	 */
+	bytes_written = 0;
+	if (state == STATE_WRITE_LF) {
+		/* complete writing a CRLF sequence */
+		if (this->write("\n", 1) < 1)
+			return 0;
+		state = STATE_START;
+		bytes_written++;
+		i++;
+	}
+
+	block_start = i;
+	while (i < buffer_len) {
+		switch (buffer[i]) {
+		case '\n':
+			if (last_c == '\r') {
+				/* EOL sequence already written */
+				bytes_written++;
+				block_start = i+1;
+				break;
+			}
+			/* fall through */
+		case '\r':
+			block_written = this->write(buffer+block_start, i-block_start);
+			bytes_written += block_written;
+			if (block_written < i-block_start)
+				return bytes_written;
+
+			block_written = this->write(eol_seq, eol_seq_len);
+			if (block_written == 0)
+				return bytes_written;
+			if (block_written < eol_seq_len) {
+				/* incomplete EOL seq - we have written CR of CRLF */
+				state = STATE_WRITE_LF;
+				return bytes_written;
+			}
+			bytes_written++;
+
+			block_start = i+1;
+			break;
+		}
+
+		last_c = buffer[i++];
+	}
+
+	/*
+	 * Write out remaining block (i.e. line)
+	 */
+	bytes_written += this->write(buffer+block_start, buffer_len-block_start);
+	return bytes_written;
+}
+
+gsize
+EOLWriterGIO::write(const gchar *buffer, gsize buffer_len)
+{
+	gsize bytes_written;
+	GError *error = NULL;
+
+	switch (g_io_channel_write_chars(channel, buffer, buffer_len,
+	                                 &bytes_written, &error)) {
+	case G_IO_STATUS_ERROR:
+		throw GlibError(error);
+	case G_IO_STATUS_EOF:
+	case G_IO_STATUS_NORMAL:
+	case G_IO_STATUS_AGAIN:
+		break;
+	}
+
+	return bytes_written;
+}
+
+gsize
+EOLWriterMem::write(const gchar *buffer, gsize buffer_len)
+{
+	g_string_append_len(str, buffer, buffer_len);
+	return buffer_len;
+}
+
+} /* namespace SciTECO */