about summary refs log tree commit diff homepage
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r-- src/Document.cxx 2
-rw-r--r-- src/UniConversion.cxx 180
-rw-r--r-- src/UniConversion.h 10
3 files changed, 102 insertions, 90 deletions
diff --git a/src/Document.cxx b/src/Document.cxx
index 412798def..48913a16c 100644
--- a/src/Document.cxx
+++ b/src/Document.cxx
@@ -116,8 +116,6 @@ Document::Document(int options) :
matchesValid = false;
- UTF8BytesOfLeadInitialise();
-
perLineData[ldMarkers] = std::make_unique<LineMarkers>();
perLineData[ldLevels] = std::make_unique<LineLevels>();
perLineData[ldState] = std::make_unique<LineState>();
diff --git a/src/UniConversion.cxx b/src/UniConversion.cxx
index 8e537c689..19b968932 100644
--- a/src/UniConversion.cxx
+++ b/src/UniConversion.cxx
@@ -65,35 +65,15 @@ void UTF8FromUTF16(const wchar_t *uptr, size_t tlen, char *putf, size_t len) {
putf[k] = '\0';
}
-unsigned int UTF8CharLength(unsigned char ch) {
- if (ch < 0x80) {
- return 1;
- } else if (ch < 0x80 + 0x40 + 0x20) {
- return 2;
- } else if (ch < 0x80 + 0x40 + 0x20 + 0x10) {
- return 3;
- } else {
- return 4;
- }
-}
-
size_t UTF16Length(const char *s, size_t len) {
size_t ulen = 0;
- size_t charLen;
- for (size_t i = 0; i<len;) {
- const unsigned char ch = static_cast<unsigned char>(s[i]);
- if (ch < 0x80) {
- charLen = 1;
- } else if (ch < 0x80 + 0x40 + 0x20) {
- charLen = 2;
- } else if (ch < 0x80 + 0x40 + 0x20 + 0x10) {
- charLen = 3;
- } else {
- charLen = 4;
- ulen++;
- }
- i += charLen;
- ulen++;
+ const unsigned char *us = reinterpret_cast<const unsigned char *>(s);
+ for (size_t i = 0; i < len;) {
+ const unsigned char ch = us[i];
+ const unsigned int byteCount = UTF8BytesOfLead[ch];
+ const unsigned int utf16Len = UTF16LengthFromUTF8ByteCount(byteCount);
+ i += byteCount;
+ ulen += (i > len) ? 1 : utf16Len;
}
return ulen;
}
@@ -104,39 +84,60 @@ constexpr unsigned char TrailByteValue(unsigned char c) {
return c & 0b0011'1111;
}
-const unsigned char utf8Start3 = 0b1110'0000;
-const unsigned char utf8Start4 = 0b1111'0000;
-
size_t UTF16FromUTF8(const char *s, size_t len, wchar_t *tbuf, size_t tlen) {
size_t ui = 0;
const unsigned char *us = reinterpret_cast<const unsigned char *>(s);
- size_t i = 0;
- while ((i<len) && (ui<tlen)) {
- unsigned char ch = us[i++];
- if (ch < 0x80) {
+ for (size_t i = 0; i < len;) {
+ unsigned char ch = us[i];
+ const unsigned int byteCount = UTF8BytesOfLead[ch];
+ unsigned int value;
+
+ if (i + byteCount > len) {
+ // Trying to read past end but still have space to write
+ if (ui < tlen) {
+ tbuf[ui] = ch;
+ ui++;
+ }
+ break;
+ }
+
+ const size_t outLen = (byteCount < 4) ? 1 : 2;
+ if (ui + outLen > tlen) {
+ throw std::runtime_error("UTF16FromUTF8: attempted write beyond end");
+ }
+
+ i++;
+ switch (byteCount) {
+ case 1:
tbuf[ui] = ch;
- } else if (ch < utf8Start3) {
- tbuf[ui] = static_cast<wchar_t>((ch & 0x1F) << 6);
+ break;
+ case 2:
+ value = (ch & 0x1F) << 6;
ch = us[i++];
- tbuf[ui] = static_cast<wchar_t>(tbuf[ui] + TrailByteValue(ch));
- } else if (ch < utf8Start4) {
- tbuf[ui] = static_cast<wchar_t>((ch & 0xF) << 12);
+ value += TrailByteValue(ch);
+ tbuf[ui] = static_cast<wchar_t>(value);
+ break;
+ case 3:
+ value = (ch & 0xF) << 12;
ch = us[i++];
- tbuf[ui] = static_cast<wchar_t>(tbuf[ui] + (TrailByteValue(ch) << 6));
+ value += (TrailByteValue(ch) << 6);
ch = us[i++];
- tbuf[ui] = static_cast<wchar_t>(tbuf[ui] + TrailByteValue(ch));
- } else {
+ value += TrailByteValue(ch);
+ tbuf[ui] = static_cast<wchar_t>(value);
+ break;
+ default:
// Outside the BMP so need two surrogates
- int val = (ch & 0x7) << 18;
+ value = (ch & 0x7) << 18;
ch = us[i++];
- val += TrailByteValue(ch) << 12;
+ value += TrailByteValue(ch) << 12;
ch = us[i++];
- val += TrailByteValue(ch) << 6;
+ value += TrailByteValue(ch) << 6;
ch = us[i++];
- val += TrailByteValue(ch);
- tbuf[ui] = static_cast<wchar_t>(((val - 0x10000) >> 10) + SURROGATE_LEAD_FIRST);
+ value += TrailByteValue(ch);
+ tbuf[ui] = static_cast<wchar_t>(((value - 0x10000) >> 10) + SURROGATE_LEAD_FIRST);
ui++;
- tbuf[ui] = static_cast<wchar_t>((val & 0x3ff) + SURROGATE_TRAIL_FIRST);
+ tbuf[ui] = static_cast<wchar_t>((value & 0x3ff) + SURROGATE_TRAIL_FIRST);
+ break;
}
ui++;
}
@@ -144,25 +145,44 @@ size_t UTF16FromUTF8(const char *s, size_t len, wchar_t *tbuf, size_t tlen) {
}
size_t UTF32FromUTF8(const char *s, size_t len, unsigned int *tbuf, size_t tlen) {
- size_t ui=0;
+ size_t ui = 0;
const unsigned char *us = reinterpret_cast<const unsigned char *>(s);
- size_t i=0;
- while ((i<len) && (ui<tlen)) {
- unsigned char ch = us[i++];
- unsigned int value = 0;
- if (ch < 0x80) {
+ for (size_t i = 0; i < len;) {
+ unsigned char ch = us[i];
+ const unsigned int byteCount = UTF8BytesOfLead[ch];
+ unsigned int value;
+
+ if (i + byteCount > len) {
+ // Trying to read past end but still have space to write
+ if (ui < tlen) {
+ tbuf[ui] = ch;
+ ui++;
+ }
+ break;
+ }
+
+ if (ui == tlen) {
+ throw std::runtime_error("UTF32FromUTF8: attempted write beyond end");
+ }
+
+ i++;
+ switch (byteCount) {
+ case 1:
value = ch;
- } else if (((len-i) >= 1) && (ch < utf8Start3)) {
+ break;
+ case 2:
value = (ch & 0x1F) << 6;
ch = us[i++];
value += TrailByteValue(ch);
- } else if (((len-i) >= 2) && (ch < utf8Start4)) {
+ break;
+ case 3:
value = (ch & 0xF) << 12;
ch = us[i++];
value += TrailByteValue(ch) << 6;
ch = us[i++];
value += TrailByteValue(ch);
- } else if ((len-i) >= 3) {
+ break;
+ default:
value = (ch & 0x7) << 18;
ch = us[i++];
value += TrailByteValue(ch) << 12;
@@ -170,6 +190,7 @@ size_t UTF32FromUTF8(const char *s, size_t len, unsigned int *tbuf, size_t tlen)
value += TrailByteValue(ch) << 6;
ch = us[i++];
value += TrailByteValue(ch);
+ break;
}
tbuf[ui] = value;
ui++;
@@ -188,33 +209,24 @@ unsigned int UTF16FromUTF32Character(unsigned int val, wchar_t *tbuf) {
}
}
-int UTF8BytesOfLead[256];
-static bool initialisedBytesOfLead = false;
-
-static int BytesFromLead(int leadByte) {
- if (leadByte < 0xC2) {
- // Single byte or invalid
- return 1;
- } else if (leadByte < 0xE0) {
- return 2;
- } else if (leadByte < 0xF0) {
- return 3;
- } else if (leadByte < 0xF5) {
- return 4;
- } else {
- // Characters longer than 4 bytes not possible in current UTF-8
- return 1;
- }
-}
-
-void UTF8BytesOfLeadInitialise() {
- if (!initialisedBytesOfLead) {
- for (int i=0; i<256; i++) {
- UTF8BytesOfLead[i] = BytesFromLead(i);
- }
- initialisedBytesOfLead = true;
- }
-}
+const unsigned char UTF8BytesOfLead[256] = {
+1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 00 - 0F
+1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 10 - 1F
+1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 20 - 2F
+1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 30 - 3F
+1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 40 - 4F
+1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 50 - 5F
+1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 60 - 6F
+1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 70 - 7F
+1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 80 - 8F
+1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 90 - 9F
+1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // A0 - AF
+1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // B0 - BF
+1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // C0 - CF
+2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // D0 - DF
+3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // E0 - EF
+4, 4, 4, 4, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // F0 - FF
+};
// Return both the width of the first character in the string and a status
// saying whether it is valid or invalid.
diff --git a/src/UniConversion.h b/src/UniConversion.h
index 2f358c9c5..0f22c06e6 100644
--- a/src/UniConversion.h
+++ b/src/UniConversion.h
@@ -16,17 +16,15 @@ const int unicodeReplacementChar = 0xFFFD;
size_t UTF8Length(const wchar_t *uptr, size_t tlen);
void UTF8FromUTF16(const wchar_t *uptr, size_t tlen, char *putf, size_t len);
-unsigned int UTF8CharLength(unsigned char ch);
size_t UTF16Length(const char *s, size_t len);
size_t UTF16FromUTF8(const char *s, size_t len, wchar_t *tbuf, size_t tlen);
size_t UTF32FromUTF8(const char *s, size_t len, unsigned int *tbuf, size_t tlen);
unsigned int UTF16FromUTF32Character(unsigned int val, wchar_t *tbuf);
std::string FixInvalidUTF8(const std::string &text);
-extern int UTF8BytesOfLead[256];
-void UTF8BytesOfLeadInitialise();
+extern const unsigned char UTF8BytesOfLead[256];
-inline bool UTF8IsTrailByte(int ch) {
+inline bool UTF8IsTrailByte(unsigned char ch) {
return (ch >= 0x80) && (ch < 0xc0);
}
@@ -64,6 +62,10 @@ inline unsigned int UTF16CharLength(wchar_t uch) {
return ((uch >= SURROGATE_LEAD_FIRST) && (uch <= SURROGATE_LEAD_LAST)) ? 2 : 1;
}
+inline unsigned int UTF16LengthFromUTF8ByteCount(unsigned int byteCount) {
+ return (byteCount < 4) ? 1 : 2;
+}
+
}
#endif