From ce9ec5366ab200193140bbd060b948f62a10286b Mon Sep 17 00:00:00 2001
From: Neil <nyamatongwe@gmail.com>
Date: Wed, 28 Feb 2024 11:44:50 +1100
Subject: Add variant of UTF8Classify that takes a char* so that client code
 does not have to reinterpret_cast. Make functions in header constexpr. Prefer
 .data() to &[0] since it is safer. Avoid else after return when not needed.

---
 src/Document.cxx      |  2 +-
 src/UniConversion.cxx | 32 +++++++++++++++++---------------
 src/UniConversion.h   |  7 ++++---
 3 files changed, 22 insertions(+), 19 deletions(-)
diff --git a/src/Document.cxx b/src/Document.cxx
index 5f77ec2de..df4e570d0 100644
--- a/src/Document.cxx
+++ b/src/Document.cxx
@@ -2235,7 +2235,7 @@ Sci::Position Document::FindText(Sci::Position minPos, Sci::Position maxPos, con
 						for (int b = 1; b < widthCharBytes; b++) {
 							bytes[b] = cbView.CharAt(posIndexDocument + b);
 						}
-						widthChar = UTF8Classify(reinterpret_cast<const unsigned char *>(bytes), widthCharBytes) & UTF8MaskWidth;
+						widthChar = UTF8Classify(bytes, widthCharBytes) & UTF8MaskWidth;
 						if (!indexSearch) {	// First character
 							widthFirstCharacter = widthChar;
 						}
diff --git a/src/UniConversion.cxx b/src/UniConversion.cxx
index 3f3bc5904..eadac8915 100644
--- a/src/UniConversion.cxx
+++ b/src/UniConversion.cxx
@@ -241,12 +241,12 @@ std::wstring WStringFromUTF8(std::string_view svu8) {
 	if constexpr (sizeof(wchar_t) == 2) {
 		const size_t len16 = UTF16Length(svu8);
 		std::wstring ws(len16, 0);
-		UTF16FromUTF8(svu8, &ws[0], len16);
+		UTF16FromUTF8(svu8, ws.data(), len16);
 		return ws;
 	} else {
 		const size_t len32 = UTF32Length(svu8);
 		std::wstring ws(len32, 0);
-		UTF32FromUTF8(svu8, reinterpret_cast<unsigned int *>(&ws[0]), len32);
+		UTF32FromUTF8(svu8, reinterpret_cast<unsigned int *>(ws.data()), len32);
 		return ws;
 	}
 }
@@ -255,11 +255,10 @@ unsigned int UTF16FromUTF32Character(unsigned int val, wchar_t *tbuf) noexcept {
 	if (val < SUPPLEMENTAL_PLANE_FIRST) {
 		tbuf[0] = static_cast<wchar_t>(val);
 		return 1;
-	} else {
-		tbuf[0] = static_cast<wchar_t>(((val - SUPPLEMENTAL_PLANE_FIRST) >> 10) + SURROGATE_LEAD_FIRST);
-		tbuf[1] = static_cast<wchar_t>((val & 0x3ff) + SURROGATE_TRAIL_FIRST);
-		return 2;
 	}
+	tbuf[0] = static_cast<wchar_t>(((val - SUPPLEMENTAL_PLANE_FIRST) >> 10) + SURROGATE_LEAD_FIRST);
+	tbuf[1] = static_cast<wchar_t>((val & 0x3ff) + SURROGATE_TRAIL_FIRST);
+	return 2;
 }
 
 const unsigned char UTF8BytesOfLead[256] = {
@@ -358,25 +357,28 @@ int UTF8Classify(const unsigned char *us, size_t len) noexcept {
 	return UTF8MaskInvalid | 1;
 }
 
+int UTF8Classify(const char *s, size_t len) noexcept {
+	return UTF8Classify(reinterpret_cast<const unsigned char *>(s), len);
+}
+
 int UTF8DrawBytes(const char *s, size_t len) noexcept {
-	const int utf8StatusNext = UTF8Classify(reinterpret_cast<const unsigned char *>(s), len);
+	const int utf8StatusNext = UTF8Classify(s, len);
 	return (utf8StatusNext & UTF8MaskInvalid) ? 1 : (utf8StatusNext & UTF8MaskWidth);
 }
 
 bool UTF8IsValid(std::string_view svu8) noexcept {
-	const unsigned char *us = reinterpret_cast<const unsigned char *>(svu8.data());
+	const char *s = svu8.data();
 	size_t remaining = svu8.length();
 	while (remaining > 0) {
-		const int utf8Status = UTF8Classify(us, remaining);
+		const int utf8Status = UTF8Classify(s, remaining);
 		if (utf8Status & UTF8MaskInvalid) {
 			return false;
-		} else {
-			const int lenChar = utf8Status & UTF8MaskWidth;
-			us += lenChar;
-			remaining -= lenChar;
 		}
+		const int lenChar = utf8Status & UTF8MaskWidth;
+		s += lenChar;
+		remaining -= lenChar;
 	}
-	return remaining == 0;
+	return true;
 }
 
 // Replace invalid bytes in UTF-8 with the replacement character
@@ -385,7 +387,7 @@ std::string FixInvalidUTF8(const std::string &text) {
 	const char *s = text.c_str();
 	size_t remaining = text.size();
 	while (remaining > 0) {
-		const int utf8Status = UTF8Classify(reinterpret_cast<const unsigned char *>(s), remaining);
+		const int utf8Status = UTF8Classify(s, remaining);
 		if (utf8Status & UTF8MaskInvalid) {
 			// Replacement character 0xFFFD = UTF8:"efbfbd".
 			result.append("\xef\xbf\xbd");
diff --git a/src/UniConversion.h b/src/UniConversion.h
index b4f4c89f6..657e3eca7 100644
--- a/src/UniConversion.h
+++ b/src/UniConversion.h
@@ -59,8 +59,9 @@ constexpr bool UTF8IsAscii(char ch) noexcept {
 
 enum { UTF8MaskWidth=0x7, UTF8MaskInvalid=0x8 };
 int UTF8Classify(const unsigned char *us, size_t len) noexcept;
+int UTF8Classify(const char *s, size_t len) noexcept;
 inline int UTF8Classify(std::string_view sv) noexcept {
-	return UTF8Classify(reinterpret_cast<const unsigned char *>(sv.data()), sv.length());
+	return UTF8Classify(sv.data(), sv.length());
 }
 
 // Similar to UTF8Classify but returns a length of 1 for invalid bytes
@@ -70,13 +71,13 @@ int UTF8DrawBytes(const char *s, size_t len) noexcept;
 // Line separator is U+2028 \xe2\x80\xa8
 // Paragraph separator is U+2029 \xe2\x80\xa9
 constexpr int UTF8SeparatorLength = 3;
-inline bool UTF8IsSeparator(const unsigned char *us) noexcept {
+constexpr bool UTF8IsSeparator(const unsigned char *us) noexcept {
 	return (us[0] == 0xe2) && (us[1] == 0x80) && ((us[2] == 0xa8) || (us[2] == 0xa9));
 }
 
 // NEL is U+0085 \xc2\x85
 constexpr int UTF8NELLength = 2;
-inline bool UTF8IsNEL(const unsigned char *us) noexcept {
+constexpr bool UTF8IsNEL(const unsigned char *us) noexcept {
 	return (us[0] == 0xc2) && (us[1] == 0x85);
 }
 
-- 
cgit v1.2.3