aboutsummaryrefslogtreecommitdiffhomepage
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r--src/Document.cxx44
-rw-r--r--src/Document.h39
2 files changed, 44 insertions, 39 deletions
diff --git a/src/Document.cxx b/src/Document.cxx
index 10c8e9ce5..87cace721 100644
--- a/src/Document.cxx
+++ b/src/Document.cxx
@@ -125,6 +125,18 @@ size_t ActionDuration::ActionsInAllowedTime(double secondsAllowed) const noexcep
return std::lround(secondsAllowed / Duration());
}
+CharacterExtracted::CharacterExtracted(const unsigned char *charBytes, size_t widthCharBytes) noexcept {
+ const int utf8status = UTF8Classify(charBytes, widthCharBytes);
+ if (utf8status & UTF8MaskInvalid) {
+ // Treat as invalid and use up just one byte
+ character = unicodeReplacementChar;
+ widthBytes = 1;
+ } else {
+ character = UnicodeFromUTF8(charBytes);
+ widthBytes = utf8status & UTF8MaskWidth;
+ }
+}
+
Document::Document(DocumentOption options) :
cb(!FlagSet(options, DocumentOption::StylesNone), FlagSet(options, DocumentOption::TextLarge)),
durationStyleOneByte(0.000001, 0.0000001, 0.00001) {
@@ -917,7 +929,7 @@ bool Document::NextCharacter(Sci::Position &pos, int moveDir) const noexcept {
}
}
-Document::CharacterExtracted Document::CharacterAfter(Sci::Position position) const noexcept {
+CharacterExtracted Document::CharacterAfter(Sci::Position position) const noexcept {
if (position >= LengthNoExcept()) {
return CharacterExtracted(unicodeReplacementChar, 0);
}
@@ -931,13 +943,7 @@ Document::CharacterExtracted Document::CharacterAfter(Sci::Position position) co
unsigned char charBytes[UTF8MaxBytes] = { leadByte, 0, 0, 0 };
for (int b = 1; b<widthCharBytes; b++)
charBytes[b] = cb.UCharAt(position + b);
- const int utf8status = UTF8Classify(charBytes, widthCharBytes);
- if (utf8status & UTF8MaskInvalid) {
- // Treat as invalid and use up just one byte
- return CharacterExtracted(unicodeReplacementChar, 1);
- } else {
- return CharacterExtracted(UnicodeFromUTF8(charBytes), utf8status & UTF8MaskWidth);
- }
+ return CharacterExtracted(charBytes, widthCharBytes);
} else {
if (IsDBCSLeadByteNoExcept(leadByte)) {
const unsigned char trailByte = cb.UCharAt(position + 1);
@@ -949,7 +955,7 @@ Document::CharacterExtracted Document::CharacterAfter(Sci::Position position) co
}
}
-Document::CharacterExtracted Document::CharacterBefore(Sci::Position position) const noexcept {
+CharacterExtracted Document::CharacterBefore(Sci::Position position) const noexcept {
if (position <= 0) {
return CharacterExtracted(unicodeReplacementChar, 0);
}
@@ -972,13 +978,7 @@ Document::CharacterExtracted Document::CharacterBefore(Sci::Position position) c
unsigned char charBytes[UTF8MaxBytes] = { 0, 0, 0, 0 };
for (Sci::Position b = 0; b<widthCharBytes; b++)
charBytes[b] = cb.UCharAt(startUTF + b);
- const int utf8status = UTF8Classify(charBytes, widthCharBytes);
- if (utf8status & UTF8MaskInvalid) {
- // Treat as invalid and use up just one byte
- return CharacterExtracted(unicodeReplacementChar, 1);
- } else {
- return CharacterExtracted(UnicodeFromUTF8(charBytes), utf8status & UTF8MaskWidth);
- }
+ return CharacterExtracted(charBytes, widthCharBytes);
}
// Else invalid UTF-8 so return position of isolated trail byte
}
@@ -2037,7 +2037,7 @@ void Document::SetCaseFolder(std::unique_ptr<CaseFolder> pcf_) noexcept {
pcf = std::move(pcf_);
}
-Document::CharacterExtracted Document::ExtractCharacter(Sci::Position position) const noexcept {
+CharacterExtracted Document::ExtractCharacter(Sci::Position position) const noexcept {
const unsigned char leadByte = cb.UCharAt(position);
if (UTF8IsAscii(leadByte)) {
// Common case: ASCII character
@@ -2047,13 +2047,7 @@ Document::CharacterExtracted Document::ExtractCharacter(Sci::Position position)
unsigned char charBytes[UTF8MaxBytes] = { leadByte, 0, 0, 0 };
for (int b=1; b<widthCharBytes; b++)
charBytes[b] = cb.UCharAt(position + b);
- const int utf8status = UTF8Classify(charBytes, widthCharBytes);
- if (utf8status & UTF8MaskInvalid) {
- // Treat as invalid and use up just one byte
- return CharacterExtracted(unicodeReplacementChar, 1);
- } else {
- return CharacterExtracted(UnicodeFromUTF8(charBytes), utf8status & UTF8MaskWidth);
- }
+ return CharacterExtracted(charBytes, widthCharBytes);
}
namespace {
@@ -3040,7 +3034,7 @@ public:
}
private:
void ReadCharacter() noexcept {
- const Document::CharacterExtracted charExtracted = doc->ExtractCharacter(position);
+ const CharacterExtracted charExtracted = doc->ExtractCharacter(position);
lenBytes = charExtracted.widthBytes;
if (charExtracted.character == unicodeReplacementChar) {
lenCharacters = 1;
diff --git a/src/Document.h b/src/Document.h
index 1cda253e1..ae784180a 100644
--- a/src/Document.h
+++ b/src/Document.h
@@ -227,6 +227,29 @@ public:
};
/**
+ * A whole character (code point) with a value and width in bytes.
+ * For UTF-8, the value is the code point value.
+ * For DBCS, it is the lead and trail bytes jammed together into one value.
+ * For 8-bit encodings, it is just the byte value.
+ */
+struct CharacterExtracted {
+ unsigned int character;
+ unsigned int widthBytes;
+
+ CharacterExtracted(unsigned int character_, unsigned int widthBytes_) noexcept :
+ character(character_), widthBytes(widthBytes_) {
+ }
+
+ // For UTF-8:
+ CharacterExtracted(const unsigned char *charBytes, size_t widthCharBytes) noexcept;
+
+ // For DBCS characters turn 2 bytes into an int
+ static CharacterExtracted DBCS(unsigned char lead, unsigned char trail) noexcept {
+ return CharacterExtracted((lead << 8) | trail, 2);
+ }
+};
+
+/**
*/
class Document : PerLine, public Scintilla::IDocument, public Scintilla::ILoader {
@@ -276,18 +299,6 @@ private:
public:
- struct CharacterExtracted {
- unsigned int character;
- unsigned int widthBytes;
- CharacterExtracted(unsigned int character_, unsigned int widthBytes_) noexcept :
- character(character_), widthBytes(widthBytes_) {
- }
- // For DBCS characters turn 2 bytes into an int
- static CharacterExtracted DBCS(unsigned char lead, unsigned char trail) noexcept {
- return CharacterExtracted((lead << 8) | trail, 2);
- }
- };
-
Scintilla::EndOfLine eolMode;
/// Can also be SC_CP_UTF8 to enable UTF-8 mode
int dbcsCodePage;
@@ -341,8 +352,8 @@ public:
Sci::Position MovePositionOutsideChar(Sci::Position pos, Sci::Position moveDir, bool checkLineEnd=true) const noexcept;
Sci::Position NextPosition(Sci::Position pos, int moveDir) const noexcept;
bool NextCharacter(Sci::Position &pos, int moveDir) const noexcept; // Returns true if pos changed
- Document::CharacterExtracted CharacterAfter(Sci::Position position) const noexcept;
- Document::CharacterExtracted CharacterBefore(Sci::Position position) const noexcept;
+ CharacterExtracted CharacterAfter(Sci::Position position) const noexcept;
+ CharacterExtracted CharacterBefore(Sci::Position position) const noexcept;
Sci_Position SCI_METHOD GetRelativePosition(Sci_Position positionStart, Sci_Position characterOffset) const override;
Sci::Position GetRelativePositionUTF16(Sci::Position positionStart, Sci::Position characterOffset) const noexcept;
int SCI_METHOD GetCharacterAndWidth(Sci_Position position, Sci_Position *pWidth) const override;