about | summary | refs | log | tree | commit | diff | homepage
diff options
context:
space:
mode:
author: Neil <nyamatongwe@gmail.com> 2024-02-28 11:44:50 +1100
committer: Neil <nyamatongwe@gmail.com> 2024-02-28 11:44:50 +1100
commit: ce9ec5366ab200193140bbd060b948f62a10286b (patch)
tree: 99043aca968b7a9dd7776bcacf28f626593aed5d
parent: d021884ab62bb339427cb01bebb2660f47417019 (diff)
download: scintilla-mirror-ce9ec5366ab200193140bbd060b948f62a10286b.tar.gz
Add a variant of UTF8Classify that takes a char* so that client code does not have
to reinterpret_cast. Make functions in header constexpr. Prefer .data() to &[] since it is safer. Avoid else when not needed.
-rw-r--r--  src/Document.cxx       2
-rw-r--r--  src/UniConversion.cxx  32
-rw-r--r--  src/UniConversion.h    7
3 files changed, 22 insertions, 19 deletions
diff --git a/src/Document.cxx b/src/Document.cxx
index 5f77ec2de..df4e570d0 100644
--- a/src/Document.cxx
+++ b/src/Document.cxx
@@ -2235,7 +2235,7 @@ Sci::Position Document::FindText(Sci::Position minPos, Sci::Position maxPos, con
for (int b = 1; b < widthCharBytes; b++) {
bytes[b] = cbView.CharAt(posIndexDocument + b);
}
- widthChar = UTF8Classify(reinterpret_cast<const unsigned char *>(bytes), widthCharBytes) & UTF8MaskWidth;
+ widthChar = UTF8Classify(bytes, widthCharBytes) & UTF8MaskWidth;
if (!indexSearch) { // First character
widthFirstCharacter = widthChar;
}
diff --git a/src/UniConversion.cxx b/src/UniConversion.cxx
index 3f3bc5904..eadac8915 100644
--- a/src/UniConversion.cxx
+++ b/src/UniConversion.cxx
@@ -241,12 +241,12 @@ std::wstring WStringFromUTF8(std::string_view svu8) {
if constexpr (sizeof(wchar_t) == 2) {
const size_t len16 = UTF16Length(svu8);
std::wstring ws(len16, 0);
- UTF16FromUTF8(svu8, &ws[0], len16);
+ UTF16FromUTF8(svu8, ws.data(), len16);
return ws;
} else {
const size_t len32 = UTF32Length(svu8);
std::wstring ws(len32, 0);
- UTF32FromUTF8(svu8, reinterpret_cast<unsigned int *>(&ws[0]), len32);
+ UTF32FromUTF8(svu8, reinterpret_cast<unsigned int *>(ws.data()), len32);
return ws;
}
}
@@ -255,11 +255,10 @@ unsigned int UTF16FromUTF32Character(unsigned int val, wchar_t *tbuf) noexcept {
if (val < SUPPLEMENTAL_PLANE_FIRST) {
tbuf[0] = static_cast<wchar_t>(val);
return 1;
- } else {
- tbuf[0] = static_cast<wchar_t>(((val - SUPPLEMENTAL_PLANE_FIRST) >> 10) + SURROGATE_LEAD_FIRST);
- tbuf[1] = static_cast<wchar_t>((val & 0x3ff) + SURROGATE_TRAIL_FIRST);
- return 2;
}
+ tbuf[0] = static_cast<wchar_t>(((val - SUPPLEMENTAL_PLANE_FIRST) >> 10) + SURROGATE_LEAD_FIRST);
+ tbuf[1] = static_cast<wchar_t>((val & 0x3ff) + SURROGATE_TRAIL_FIRST);
+ return 2;
}
const unsigned char UTF8BytesOfLead[256] = {
@@ -358,25 +357,28 @@ int UTF8Classify(const unsigned char *us, size_t len) noexcept {
return UTF8MaskInvalid | 1;
}
+int UTF8Classify(const char *s, size_t len) noexcept {
+ return UTF8Classify(reinterpret_cast<const unsigned char *>(s), len);
+}
+
int UTF8DrawBytes(const char *s, size_t len) noexcept {
- const int utf8StatusNext = UTF8Classify(reinterpret_cast<const unsigned char *>(s), len);
+ const int utf8StatusNext = UTF8Classify(s, len);
return (utf8StatusNext & UTF8MaskInvalid) ? 1 : (utf8StatusNext & UTF8MaskWidth);
}
bool UTF8IsValid(std::string_view svu8) noexcept {
- const unsigned char *us = reinterpret_cast<const unsigned char *>(svu8.data());
+ const char *s = svu8.data();
size_t remaining = svu8.length();
while (remaining > 0) {
- const int utf8Status = UTF8Classify(us, remaining);
+ const int utf8Status = UTF8Classify(s, remaining);
if (utf8Status & UTF8MaskInvalid) {
return false;
- } else {
- const int lenChar = utf8Status & UTF8MaskWidth;
- us += lenChar;
- remaining -= lenChar;
}
+ const int lenChar = utf8Status & UTF8MaskWidth;
+ s += lenChar;
+ remaining -= lenChar;
}
- return remaining == 0;
+ return true;
}
// Replace invalid bytes in UTF-8 with the replacement character
@@ -385,7 +387,7 @@ std::string FixInvalidUTF8(const std::string &text) {
const char *s = text.c_str();
size_t remaining = text.size();
while (remaining > 0) {
- const int utf8Status = UTF8Classify(reinterpret_cast<const unsigned char *>(s), remaining);
+ const int utf8Status = UTF8Classify(s, remaining);
if (utf8Status & UTF8MaskInvalid) {
// Replacement character 0xFFFD = UTF8:"efbfbd".
result.append("\xef\xbf\xbd");
diff --git a/src/UniConversion.h b/src/UniConversion.h
index b4f4c89f6..657e3eca7 100644
--- a/src/UniConversion.h
+++ b/src/UniConversion.h
@@ -59,8 +59,9 @@ constexpr bool UTF8IsAscii(char ch) noexcept {
enum { UTF8MaskWidth=0x7, UTF8MaskInvalid=0x8 };
int UTF8Classify(const unsigned char *us, size_t len) noexcept;
+int UTF8Classify(const char *s, size_t len) noexcept;
inline int UTF8Classify(std::string_view sv) noexcept {
- return UTF8Classify(reinterpret_cast<const unsigned char *>(sv.data()), sv.length());
+ return UTF8Classify(sv.data(), sv.length());
}
// Similar to UTF8Classify but returns a length of 1 for invalid bytes
@@ -70,13 +71,13 @@ int UTF8DrawBytes(const char *s, size_t len) noexcept;
// Line separator is U+2028 \xe2\x80\xa8
// Paragraph separator is U+2029 \xe2\x80\xa9
constexpr int UTF8SeparatorLength = 3;
-inline bool UTF8IsSeparator(const unsigned char *us) noexcept {
+constexpr bool UTF8IsSeparator(const unsigned char *us) noexcept {
return (us[0] == 0xe2) && (us[1] == 0x80) && ((us[2] == 0xa8) || (us[2] == 0xa9));
}
// NEL is U+0085 \xc2\x85
constexpr int UTF8NELLength = 2;
-inline bool UTF8IsNEL(const unsigned char *us) noexcept {
+constexpr bool UTF8IsNEL(const unsigned char *us) noexcept {
return (us[0] == 0xc2) && (us[1] == 0x85);
}