aboutsummaryrefslogtreecommitdiffhomepage
path: root/src/Document.cxx
diff options
context:
space:
mode:
Diffstat (limited to 'src/Document.cxx')
-rw-r--r--src/Document.cxx119
1 files changed, 61 insertions, 58 deletions
diff --git a/src/Document.cxx b/src/Document.cxx
index 4e9366064..244e96e4f 100644
--- a/src/Document.cxx
+++ b/src/Document.cxx
@@ -465,51 +465,29 @@ int Document::LenChar(int pos) {
}
}
-static inline bool IsTrailByte(int ch) {
- return (ch >= 0x80) && (ch < 0xc0);
-}
-
-static int BytesFromLead(int leadByte) {
- if (leadByte > 0xF4) {
- // Characters longer than 4 bytes not possible in current UTF-8
- return 0;
- } else if (leadByte >= 0xF0) {
- return 4;
- } else if (leadByte >= 0xE0) {
- return 3;
- } else if (leadByte >= 0xC2) {
- return 2;
- }
- return 0;
-}
-
bool Document::InGoodUTF8(int pos, int &start, int &end) const {
- int lead = pos;
- while ((lead>0) && (pos-lead < 4) && IsTrailByte(static_cast<unsigned char>(cb.CharAt(lead-1))))
- lead--;
- start = 0;
- if (lead > 0) {
- start = lead-1;
- }
- int leadByte = static_cast<unsigned char>(cb.CharAt(start));
- int bytes = BytesFromLead(leadByte);
- if (bytes == 0) {
+ int trail = pos;
+ while ((trail>0) && (pos-trail < UTF8MaxBytes) && UTF8IsTrailByte(static_cast<unsigned char>(cb.CharAt(trail-1))))
+ trail--;
+ start = (trail > 0) ? trail-1 : trail;
+
+ const unsigned char leadByte = static_cast<unsigned char>(cb.CharAt(start));
+ const int widthCharBytes = UTF8BytesOfLead[leadByte];
+ if (widthCharBytes == 1) {
return false;
} else {
- int trailBytes = bytes - 1;
- int len = pos - lead + 1;
+ int trailBytes = widthCharBytes - 1;
+ int len = pos - start;
if (len > trailBytes)
// pos too far from lead
return false;
- // Check that there are enough trails for this lead
- int trail = pos + 1;
- while ((trail-lead<trailBytes) && (trail < Length())) {
- if (!IsTrailByte(static_cast<unsigned char>(cb.CharAt(trail)))) {
- return false;
- }
- trail++;
- }
- end = start + bytes;
+ char charBytes[UTF8MaxBytes] = {static_cast<char>(leadByte),0,0,0};
+ for (int b=1; b<widthCharBytes && ((start+b) < Length()); b++)
+ charBytes[b] = cb.CharAt(static_cast<int>(start+b));
+ int utf8status = UTF8Classify(reinterpret_cast<const unsigned char *>(charBytes), widthCharBytes);
+ if (utf8status & UTF8MaskInvalid)
+ return false;
+ end = start + widthCharBytes;
return true;
}
}
@@ -538,14 +516,18 @@ int Document::MovePositionOutsideChar(int pos, int moveDir, bool checkLineEnd) {
if (dbcsCodePage) {
if (SC_CP_UTF8 == dbcsCodePage) {
unsigned char ch = static_cast<unsigned char>(cb.CharAt(pos));
- int startUTF = pos;
- int endUTF = pos;
- if (IsTrailByte(ch) && InGoodUTF8(pos, startUTF, endUTF)) {
- // ch is a trail byte within a UTF-8 character
- if (moveDir > 0)
- pos = endUTF;
- else
- pos = startUTF;
+ // If ch is not a trail byte then pos is valid intercharacter position
+ if (UTF8IsTrailByte(ch)) {
+ int startUTF = pos;
+ int endUTF = pos;
+ if (InGoodUTF8(pos, startUTF, endUTF)) {
+ // ch is a trail byte within a UTF-8 character
+ if (moveDir > 0)
+ pos = endUTF;
+ else
+ pos = startUTF;
+ }
+ // Else invalid UTF-8 so return position of isolated trail byte
}
} else {
// Anchor DBCS calculations at start of line because start of line can
@@ -592,16 +574,37 @@ int Document::NextPosition(int pos, int moveDir) const {
if (dbcsCodePage) {
if (SC_CP_UTF8 == dbcsCodePage) {
- pos += increment;
- unsigned char ch = static_cast<unsigned char>(cb.CharAt(pos));
- int startUTF = pos;
- int endUTF = pos;
- if (IsTrailByte(ch) && InGoodUTF8(pos, startUTF, endUTF)) {
- // ch is a trail byte within a UTF-8 character
- if (moveDir > 0)
- pos = endUTF;
- else
- pos = startUTF;
+ if (increment == 1) {
+ // Simple forward movement case so can avoid some checks
+ const unsigned char leadByte = static_cast<unsigned char>(cb.CharAt(pos));
+ if (UTF8IsAscii(leadByte)) {
+ // Single byte character or invalid
+ pos++;
+ } else {
+ const int widthCharBytes = UTF8BytesOfLead[leadByte];
+ char charBytes[UTF8MaxBytes] = {static_cast<char>(leadByte),0,0,0};
+ for (int b=1; b<widthCharBytes; b++)
+ charBytes[b] = cb.CharAt(static_cast<int>(pos+b));
+ int utf8status = UTF8Classify(reinterpret_cast<const unsigned char *>(charBytes), widthCharBytes);
+ if (utf8status & UTF8MaskInvalid)
+ pos++;
+ else
+ pos += utf8status & UTF8MaskWidth;
+ }
+ } else {
+ // Examine byte before position
+ pos--;
+ unsigned char ch = static_cast<unsigned char>(cb.CharAt(pos));
+ // If ch is not a trail byte then pos is valid intercharacter position
+ if (UTF8IsTrailByte(ch)) {
+ // If ch is a trail byte in a valid UTF-8 character then return start of character
+ int startUTF = pos;
+ int endUTF = pos;
+ if (InGoodUTF8(pos, startUTF, endUTF)) {
+ pos = startUTF;
+ }
+ // Else invalid UTF-8 so return position of isolated trail byte
+ }
}
} else {
if (moveDir > 0) {
@@ -1246,7 +1249,7 @@ int Document::ParaDown(int pos) {
}
CharClassify::cc Document::WordCharClass(unsigned char ch) {
- if ((SC_CP_UTF8 == dbcsCodePage) && (ch >= 0x80))
+ if ((SC_CP_UTF8 == dbcsCodePage) && (!UTF8IsAscii(ch)))
return CharClassify::ccWord;
return charClass.GetClass(ch);
}