3 files changed, 102 insertions, 90 deletions
diff --git a/src/Document.cxx b/src/Document.cxx
index 412798def..48913a16c 100644
--- a/src/Document.cxx
+++ b/src/Document.cxx
@@ -116,8 +116,6 @@ Document::Document(int options) :
 
 	matchesValid = false;
 
-	UTF8BytesOfLeadInitialise();
-
 	perLineData[ldMarkers] = std::make_unique<LineMarkers>();
 	perLineData[ldLevels] = std::make_unique<LineLevels>();
 	perLineData[ldState] = std::make_unique<LineState>();
diff --git a/src/UniConversion.cxx b/src/UniConversion.cxx
index 8e537c689..19b968932 100644
--- a/src/UniConversion.cxx
+++ b/src/UniConversion.cxx
@@ -65,35 +65,15 @@ void UTF8FromUTF16(const wchar_t *uptr, size_t tlen, char *putf, size_t len) {
 		putf[k] = '\0';
 }
 
-unsigned int UTF8CharLength(unsigned char ch) {
-	if (ch < 0x80) {
-		return 1;
-	} else if (ch < 0x80 + 0x40 + 0x20) {
-		return 2;
-	} else if (ch < 0x80 + 0x40 + 0x20 + 0x10) {
-		return 3;
-	} else {
-		return 4;
-	}
-}
-
 size_t UTF16Length(const char *s, size_t len) {
 	size_t ulen = 0;
-	size_t charLen;
-	for (size_t i = 0; i<len;) {
-		const unsigned char ch = static_cast<unsigned char>(s[i]);
-		if (ch < 0x80) {
-			charLen = 1;
-		} else if (ch < 0x80 + 0x40 + 0x20) {
-			charLen = 2;
-		} else if (ch < 0x80 + 0x40 + 0x20 + 0x10) {
-			charLen = 3;
-		} else {
-			charLen = 4;
-			ulen++;
-		}
-		i += charLen;
-		ulen++;
+	const unsigned char *us = reinterpret_cast<const unsigned char *>(s);	// Unsigned view of the input so each byte can index UTF8BytesOfLead
+	for (size_t i = 0; i < len;) {
+		const unsigned char ch = us[i];
+		const unsigned int byteCount = UTF8BytesOfLead[ch];	// Sequence length implied by the lead byte
+		const unsigned int utf16Len = UTF16LengthFromUTF8ByteCount(byteCount);	// 1 unit, or 2 for a 4-byte sequence
+		i += byteCount;
+		ulen += (i > len) ? 1 : utf16Len;	// A sequence cut off by the end of input counts as a single unit
 	}
 	return ulen;
 }
@@ -104,39 +84,60 @@ constexpr unsigned char TrailByteValue(unsigned char c) {
 	return c & 0b0011'1111;
 }
 
-const unsigned char utf8Start3 = 0b1110'0000;
-const unsigned char utf8Start4 = 0b1111'0000;
-
 size_t UTF16FromUTF8(const char *s, size_t len, wchar_t *tbuf, size_t tlen) {
 	size_t ui = 0;
 	const unsigned char *us = reinterpret_cast<const unsigned char *>(s);
-	size_t i = 0;
-	while ((i<len) && (ui<tlen)) {
-		unsigned char ch = us[i++];
-		if (ch < 0x80) {
+	for (size_t i = 0; i < len;) {
+		unsigned char ch = us[i];
+		const unsigned int byteCount = UTF8BytesOfLead[ch];	// Sequence length implied by the lead byte
+		unsigned int value;
+
+		if (i + byteCount > len) {
+			// Sequence is cut off by the end of input: store the lead byte alone if there is room, then stop
+			if (ui < tlen) {
+				tbuf[ui] = ch;
+				ui++;
+			}
+			break;
+		}
+
+		const size_t outLen = UTF16LengthFromUTF8ByteCount(byteCount);	// Same helper as UTF16Length so both agree on sizes
+		if (ui + outLen > tlen) {	// Check before writing so tbuf is never overrun
+			throw std::runtime_error("UTF16FromUTF8: attempted write beyond end");
+		}
+
+		i++;	// Step past the lead byte; each case below consumes its trail bytes
+		switch (byteCount) {
+		case 1:
 			tbuf[ui] = ch;
-		} else if (ch < utf8Start3) {
-			tbuf[ui] = static_cast<wchar_t>((ch & 0x1F) << 6);
+			break;
+		case 2:
+			value = (ch & 0x1F) << 6;
 			ch = us[i++];
-			tbuf[ui] = static_cast<wchar_t>(tbuf[ui] + TrailByteValue(ch));
-		} else if (ch < utf8Start4) {
-			tbuf[ui] = static_cast<wchar_t>((ch & 0xF) << 12);
+			value += TrailByteValue(ch);
+			tbuf[ui] = static_cast<wchar_t>(value);
+			break;
+		case 3:
+			value = (ch & 0xF) << 12;
 			ch = us[i++];
-			tbuf[ui] = static_cast<wchar_t>(tbuf[ui] + (TrailByteValue(ch) << 6));
+			value += (TrailByteValue(ch) << 6);
 			ch = us[i++];
-			tbuf[ui] = static_cast<wchar_t>(tbuf[ui] + TrailByteValue(ch));
-		} else {
+			value += TrailByteValue(ch);
+			tbuf[ui] = static_cast<wchar_t>(value);
+			break;
+		default:	// byteCount is 4: the lead-byte table holds only 1, 2, 3 or 4
 			// Outside the BMP so need two surrogates
-			int val = (ch & 0x7) << 18;
+			value = (ch & 0x7) << 18;
 			ch = us[i++];
-			val += TrailByteValue(ch) << 12;
+			value += TrailByteValue(ch) << 12;
 			ch = us[i++];
-			val += TrailByteValue(ch) << 6;
+			value += TrailByteValue(ch) << 6;
 			ch = us[i++];
-			val += TrailByteValue(ch);
-			tbuf[ui] = static_cast<wchar_t>(((val - 0x10000) >> 10) + SURROGATE_LEAD_FIRST);
+			value += TrailByteValue(ch);
+			tbuf[ui] = static_cast<wchar_t>(((value - 0x10000) >> 10) + SURROGATE_LEAD_FIRST);
 			ui++;
-			tbuf[ui] = static_cast<wchar_t>((val & 0x3ff) + SURROGATE_TRAIL_FIRST);
+			tbuf[ui] = static_cast<wchar_t>((value & 0x3ff) + SURROGATE_TRAIL_FIRST);
+			break;
 		}
 		ui++;
 	}
@@ -144,25 +145,44 @@ size_t UTF16FromUTF8(const char *s, size_t len, wchar_t *tbuf, size_t tlen) {
 }
 
 size_t UTF32FromUTF8(const char *s, size_t len, unsigned int *tbuf, size_t tlen) {
-	size_t ui=0;
+	size_t ui = 0;
 	const unsigned char *us = reinterpret_cast<const unsigned char *>(s);
-	size_t i=0;
-	while ((i<len) && (ui<tlen)) {
-		unsigned char ch = us[i++];
-		unsigned int value = 0;
-		if (ch < 0x80) {
+	for (size_t i = 0; i < len;) {
+		unsigned char ch = us[i];
+		const unsigned int byteCount = UTF8BytesOfLead[ch];	// Sequence length implied by the lead byte
+		unsigned int value;
+
+		if (i + byteCount > len) {
+			// Sequence is cut off by the end of input: store the lead byte alone if there is room, then stop
+			if (ui < tlen) {
+				tbuf[ui] = ch;
+				ui++;
+			}
+			break;
+		}
+
+		if (ui == tlen) {	// Each sequence produces exactly one output element
+			throw std::runtime_error("UTF32FromUTF8: attempted write beyond end");
+		}
+
+		i++;	// Step past the lead byte; each case below consumes its trail bytes
+		switch (byteCount) {
+		case 1:
 			value = ch;
-		} else if (((len-i) >= 1) && (ch < utf8Start3)) {
+			break;
+		case 2:
 			value = (ch & 0x1F) << 6;
 			ch = us[i++];
 			value += TrailByteValue(ch);
-		} else if (((len-i) >= 2) && (ch < utf8Start4)) {
+			break;
+		case 3:
 			value = (ch & 0xF) << 12;
 			ch = us[i++];
 			value += TrailByteValue(ch) << 6;
 			ch = us[i++];
 			value += TrailByteValue(ch);
-		} else if ((len-i) >= 3) {
+			break;
+		default:	// byteCount is 4: the lead-byte table holds only 1, 2, 3 or 4
 			value = (ch & 0x7) << 18;
 			ch = us[i++];
 			value += TrailByteValue(ch) << 12;
@@ -170,6 +190,7 @@ size_t UTF32FromUTF8(const char *s, size_t len, unsigned int *tbuf, size_t tlen)
 			value += TrailByteValue(ch) << 6;
 			ch = us[i++];
 			value += TrailByteValue(ch);
+			break;
 		}
 		tbuf[ui] = value;
 		ui++;
@@ -188,33 +209,24 @@ unsigned int UTF16FromUTF32Character(unsigned int val, wchar_t *tbuf) {
 	}
 }
 
-int UTF8BytesOfLead[256];
-static bool initialisedBytesOfLead = false;
-
-static int BytesFromLead(int leadByte) {
-	if (leadByte < 0xC2) {
-		// Single byte or invalid
-		return 1;
-	} else if (leadByte < 0xE0) {
-		return 2;
-	} else if (leadByte < 0xF0) {
-		return 3;
-	} else if (leadByte < 0xF5) {
-		return 4;
-	} else {
-		// Characters longer than 4 bytes not possible in current UTF-8
-		return 1;
-	}
-}
-
-void UTF8BytesOfLeadInitialise() {
-	if (!initialisedBytesOfLead) {
-		for (int i=0; i<256; i++) {
-			UTF8BytesOfLead[i] = BytesFromLead(i);
-		}
-		initialisedBytesOfLead = true;
-	}
-}
+const unsigned char UTF8BytesOfLead[256] = {	// Indexed by lead byte: length in bytes of the UTF-8 sequence it starts; 1 for single bytes and invalid leads
+1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 00 - 0F
+1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 10 - 1F
+1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 20 - 2F
+1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 30 - 3F
+1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 40 - 4F
+1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 50 - 5F
+1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 60 - 6F
+1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 70 - 7F
+1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 80 - 8F: trail bytes, not valid as a lead
+1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 90 - 9F: trail bytes, not valid as a lead
+1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // A0 - AF: trail bytes, not valid as a lead
+1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // B0 - BF: trail bytes, not valid as a lead
+1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // C0 - CF: C0 and C1 are treated as invalid single bytes
+2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // D0 - DF
+3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // E0 - EF
+4, 4, 4, 4, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // F0 - FF: F5 and above would need more than 4 bytes, so count as 1
+};
 
 // Return both the width of the first character in the string and a status
 // saying whether it is valid or invalid.
diff --git a/src/UniConversion.h b/src/UniConversion.h
index 2f358c9c5..0f22c06e6 100644
--- a/src/UniConversion.h
+++ b/src/UniConversion.h
@@ -16,17 +16,15 @@ const int unicodeReplacementChar = 0xFFFD;
 
 size_t UTF8Length(const wchar_t *uptr, size_t tlen);
 void UTF8FromUTF16(const wchar_t *uptr, size_t tlen, char *putf, size_t len);
-unsigned int UTF8CharLength(unsigned char ch);
 size_t UTF16Length(const char *s, size_t len);
 size_t UTF16FromUTF8(const char *s, size_t len, wchar_t *tbuf, size_t tlen);
 size_t UTF32FromUTF8(const char *s, size_t len, unsigned int *tbuf, size_t tlen);
 unsigned int UTF16FromUTF32Character(unsigned int val, wchar_t *tbuf);
 std::string FixInvalidUTF8(const std::string &text);
 
-extern int UTF8BytesOfLead[256];
-void UTF8BytesOfLeadInitialise();
+extern const unsigned char UTF8BytesOfLead[256];	// Lead byte -> UTF-8 sequence length in bytes (1 to 4)
 
-inline bool UTF8IsTrailByte(int ch) {
+inline bool UTF8IsTrailByte(unsigned char ch) {	// True when ch lies in 0x80 - 0xBF, the UTF-8 trail byte range
 	return (ch >= 0x80) && (ch < 0xc0);
 }
 
@@ -64,6 +62,10 @@ inline unsigned int UTF16CharLength(wchar_t uch) {
 	return ((uch >= SURROGATE_LEAD_FIRST) && (uch <= SURROGATE_LEAD_LAST)) ? 2 : 1;
 }
 
+inline unsigned int UTF16LengthFromUTF8ByteCount(unsigned int byteCount) {	// UTF-16 code units needed for a UTF-8 sequence of byteCount bytes
+	return (byteCount < 4) ? 1 : 2;	// Only 4-byte sequences need two units (a surrogate pair)
+}
+
 }
 
 #endif