aboutsummaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
-rw-r--r--doc/ScintillaDoc.html7
-rw-r--r--include/ILexer.h1
-rw-r--r--lexlib/LexAccessor.h15
-rw-r--r--lexlib/StyleContext.h115
-rw-r--r--src/Document.cxx65
-rw-r--r--src/Document.h1
6 files changed, 147 insertions, 57 deletions
diff --git a/doc/ScintillaDoc.html b/doc/ScintillaDoc.html
index 281bbf957..abec92a5b 100644
--- a/doc/ScintillaDoc.html
+++ b/doc/ScintillaDoc.html
@@ -6321,13 +6321,18 @@ exception options.</p>
<p>
To allow lexers to determine the end position of a line and thus more easily support Unicode line ends
-<code>IDocument</code> is extended to <code>IDocumentWithLineEnd</code>.
+<code>IDocument</code> is extended to <code>IDocumentWithLineEnd</code>.</p>
+<p>The <code>GetRelativePosition</code> method allows navigating the document by whole characters and provides a standard
+conversion from UTF-8 bytes to a UTF-32 character or from DBCS to a 16 bit value.
+Invalid UTF-8 is reported as a character for each byte with values 0xDC80+byteValue, which are
+not valid Unicode code points.
</p>
<div class="highlighted">
<span class="S5">class</span><span class="S0"> </span>IDocumentWithLineEnd<span class="S0"> </span><span class="S10">:</span><span class="S0"> </span><span class="S5">public</span><span class="S0"> </span>IDocument<span class="S0"> </span><span class="S10">{</span><br />
<span class="S5">public</span><span class="S10">:</span><br />
<span class="S0">&nbsp; &nbsp; &nbsp; &nbsp; </span><span class="S5">virtual</span><span class="S0"> </span><span class="S5">int</span><span class="S0"> </span>SCI_METHOD<span class="S0"> </span>LineEnd<span class="S10">(</span><span class="S5">int</span><span class="S0"> </span>line<span class="S10">)</span><span class="S0"> </span><span class="S5">const</span><span class="S0"> </span><span class="S10">=</span><span class="S0"> </span><span class="S4">0</span><span class="S10">;</span><br />
+<span class="S0">&nbsp; &nbsp; &nbsp; &nbsp; </span><span class="S5">virtual</span><span class="S0"> </span><span class="S5">int</span><span class="S0"> </span>SCI_METHOD<span class="S0"> </span>GetRelativePosition<span class="S10">(</span><span class="S5">int</span><span class="S0"> </span>start<span class="S10">,</span><span class="S0"> </span><span class="S5">int</span><span class="S0"> </span>characterOffset<span class="S10">,</span><span class="S0"> </span><span class="S5">int</span><span class="S0"> </span><span class="S10">*</span>character<span class="S10">,</span><span class="S0"> </span><span class="S5">int</span><span class="S0"> </span><span class="S10">*</span>width<span class="S10">)</span><span class="S0"> </span><span class="S5">const</span><span class="S0"> </span><span class="S10">=</span><span class="S0"> </span><span class="S4">0</span><span class="S10">;</span><br />
<span class="S10">};</span><br />
</div>
diff --git a/include/ILexer.h b/include/ILexer.h
index 1260c1373..9f9225ef2 100644
--- a/include/ILexer.h
+++ b/include/ILexer.h
@@ -48,6 +48,7 @@ public:
class IDocumentWithLineEnd : public IDocument { // IDocument extended with line-end and whole-character navigation queries
public:
virtual int SCI_METHOD LineEnd(int line) const = 0;
+ virtual int SCI_METHOD GetRelativePosition(int start, int characterOffset, int *character, int *width) const = 0; // moves characterOffset whole characters from start; reports the character found and its width through the out-parameters
};
enum { lvOriginal=0, lvSubStyles=1 };
diff --git a/lexlib/LexAccessor.h b/lexlib/LexAccessor.h
index 4223f302d..92e719360 100644
--- a/lexlib/LexAccessor.h
+++ b/lexlib/LexAccessor.h
@@ -126,6 +126,21 @@ public:
return startNext - 1;
}
}
+ int GetRelativePosition(int start, int characterOffset, int *character, int *width) { // Move characterOffset characters from start; returns the new position or -1 when it falls outside the document
+ if (documentVersion >= dvLineEnd) {
+ return (static_cast<IDocumentWithLineEnd *>(pAccess))->GetRelativePosition(
+ start, characterOffset, character, width);
+ } else {
+ // Old version -> byte-oriented only, so every byte counts as one character of width 1
+ // Handle doc range overflow: *character and *width are left untouched when -1 is returned
+ int posNew = start + characterOffset;
+ if ((posNew < 0) || (posNew > Length()))
+ return -1;
+ *character = static_cast<unsigned char>(SafeGetCharAt(posNew, 0)); // unsigned so that bytes >= 0x80 are not reported as negative values
+ *width = 1;
+ return posNew;
+ }
+ }
int LevelAt(int line) const {
return pAccess->GetLevel(line);
}
diff --git a/lexlib/StyleContext.h b/lexlib/StyleContext.h
index 2c010645b..0b5dee379 100644
--- a/lexlib/StyleContext.h
+++ b/lexlib/StyleContext.h
@@ -51,35 +51,27 @@ class StyleContext {
LexAccessor &styler;
unsigned int endPos;
unsigned int lengthDocument;
+
+ // Used for optimizing GetRelativeCharacter
+ unsigned int posRelative;
+ unsigned int currentPosLastRelative;
+ int offsetRelative;
+
StyleContext &operator=(const StyleContext &);
- void GetNextChar(unsigned int pos) {
- chNext = static_cast<unsigned char>(styler.SafeGetCharAt(pos+1, 0));
- if (styler.Encoding() == encUnicode) {
- if (chNext >= 0x80) {
- unsigned char bytes[4] = { static_cast<unsigned char>(chNext), 0, 0, 0 };
- for (int trail=1; trail<3; trail++) {
- bytes[trail] = static_cast<unsigned char>(styler.SafeGetCharAt(pos+1+trail, 0));
- if (!((bytes[trail] >= 0x80) && (bytes[trail] < 0xc0))) {
- bytes[trail] = 0;
- break;
- }
- }
- chNext = UnicodeCodePoint(bytes);
- }
- } else if (styler.Encoding() == encDBCS) {
- if (styler.IsLeadByte(static_cast<char>(chNext))) {
- chNext = chNext << 8;
- chNext |= static_cast<unsigned char>(styler.SafeGetCharAt(pos+2, 0));
- }
+ void GetNextChar() { // Loads chNext/widthNext from position currentPos+width and refreshes atLineEnd
+ if (styler.Encoding() == enc8bit) { // single-byte encoding: read the byte directly
+ chNext = static_cast<unsigned char>(styler.SafeGetCharAt(currentPos+width, 0));
+ widthNext = 1;
+ } else {
+ styler.GetRelativePosition(currentPos+width, 0, &chNext, &widthNext); // offset 0: only the character and width at that position are wanted; the returned position is ignored
}
- // End of line?
- // Trigger on CR only (Mac style) or either on LF from CR+LF (Dos/Win)
- // or on LF alone (Unix). Avoid triggering two times on Dos/Win.
+ // End of line determined from line end position, allowing CR, LF,
+ // CRLF and Unicode line ends as set by document.
if (currentLine < lineDocEnd)
- atLineEnd = static_cast<int>(pos) >= (lineStartNext-1);
+ atLineEnd = static_cast<int>(currentPos) >= (lineStartNext-1);
else // Last line
- atLineEnd = static_cast<int>(pos) >= lineStartNext;
+ atLineEnd = static_cast<int>(currentPos) >= lineStartNext;
}
public:
@@ -92,12 +84,17 @@ public:
int state;
int chPrev;
int ch;
+ int width;
int chNext;
+ int widthNext;
StyleContext(unsigned int startPos, unsigned int length,
int initStyle, LexAccessor &styler_, char chMask=31) :
styler(styler_),
endPos(startPos + length),
+ posRelative(0),
+ currentPosLastRelative(0x7FFFFFFF),
+ offsetRelative(0),
currentPos(startPos),
currentLine(-1),
lineStartNext(-1),
@@ -105,7 +102,9 @@ public:
state(initStyle & chMask), // Mask off all bits which aren't in the chMask.
chPrev(0),
ch(0),
- chNext(0) {
+ width(0),
+ chNext(0),
+ widthNext(1) {
styler.StartAt(startPos, chMask);
styler.StartSegment(startPos);
currentLine = styler.GetLine(startPos);
@@ -115,21 +114,14 @@ public:
endPos++;
lineDocEnd = styler.GetLine(lengthDocument);
atLineStart = static_cast<unsigned int>(styler.LineStart(currentLine)) == startPos;
- unsigned int pos = currentPos;
- ch = static_cast<unsigned char>(styler.SafeGetCharAt(pos, 0));
- if (styler.Encoding() == encUnicode) {
- // Get the current char
- GetNextChar(pos-1);
- ch = chNext;
- pos += BytesInUnicodeCodePoint(ch) - 1;
- } else if (styler.Encoding() == encDBCS) {
- if (styler.IsLeadByte(static_cast<char>(ch))) {
- pos++;
- ch = ch << 8;
- ch |= static_cast<unsigned char>(styler.SafeGetCharAt(pos, 0));
- }
- }
- GetNextChar(pos);
+
+ // Variable width is now 0 so GetNextChar gets the char at currentPos into chNext/widthNext
+ width = 0;
+ GetNextChar();
+ ch = chNext;
+ width = widthNext;
+
+ GetNextChar();
}
void Complete() {
styler.ColourTo(currentPos - ((currentPos > lengthDocument) ? 2 : 1), state);
@@ -146,23 +138,10 @@ public:
lineStartNext = styler.LineStart(currentLine+1);
}
chPrev = ch;
- if (styler.Encoding() == encUnicode) {
- currentPos += BytesInUnicodeCodePoint(ch);
- } else if (styler.Encoding() == encDBCS) {
- currentPos++;
- if (ch >= 0x100)
- currentPos++;
- } else {
- currentPos++;
- }
+ currentPos += width;
ch = chNext;
- if (styler.Encoding() == encUnicode) {
- GetNextChar(currentPos + BytesInUnicodeCodePoint(ch)-1);
- } else if (styler.Encoding() == encDBCS) {
- GetNextChar(currentPos + ((ch >= 0x100) ? 1 : 0));
- } else {
- GetNextChar(currentPos);
- }
+ width = widthNext;
+ GetNextChar();
} else {
atLineStart = false;
chPrev = ' ';
@@ -200,6 +179,30 @@ public:
int GetRelative(int n) {
return static_cast<unsigned char>(styler.SafeGetCharAt(currentPos+n, 0));
}
+ int GetRelativeCharacter(int n) { // Character n whole characters away from currentPos; 0 when that position cannot be reached
+ if (n == 0)
+ return ch;
+ if (styler.Encoding() == enc8bit) {
+ // fast version for single byte encodings
+ return static_cast<unsigned char>(styler.SafeGetCharAt(currentPos + n, 0));
+ } else {
+ int chReturn = 0; // distinct names so the members ch and width are not shadowed
+ int widthReturn = 0;
+ // Restart from currentPos unless the cached position belongs to currentPos and n lies further in the same direction
+ if ((currentPosLastRelative != currentPos) ||
+ ((n > 0) && ((offsetRelative < 0) || (n < offsetRelative))) ||
+ ((n < 0) && ((offsetRelative > 0) || (n > offsetRelative)))) {
+ posRelative = currentPos;
+ offsetRelative = 0;
+ }
+ int diffRelative = n - offsetRelative;
+ int posNew = styler.GetRelativePosition(posRelative, diffRelative, &chReturn, &widthReturn);
+ posRelative = posNew;
+ currentPosLastRelative = (posNew < 0) ? 0x7FFFFFFF : currentPos; // a failed lookup (-1) must not seed the next call, so drop the cache
+ offsetRelative = n;
+ return chReturn;
+ }
+ }
bool Match(char ch0) const {
return ch == static_cast<unsigned char>(ch0);
}
diff --git a/src/Document.cxx b/src/Document.cxx
index 8523a00fa..472567068 100644
--- a/src/Document.cxx
+++ b/src/Document.cxx
@@ -699,6 +699,71 @@ bool Document::NextCharacter(int &pos, int moveDir) const {
}
}
+static inline int UnicodeFromBytes(const unsigned char *us) { // Combines a UTF-8 byte sequence starting at us[0] into one value; trail bytes are masked, not validated
+ if (us[0] < 0xC2) { // lead below 0xC2: the byte itself is returned
+ return us[0];
+ } else if (us[0] < 0xE0) { // two bytes: 5 bits from the lead, 6 from the trail
+ return ((us[0] & 0x1F) << 6) + (us[1] & 0x3F);
+ } else if (us[0] < 0xF0) { // three bytes: 4 + 6 + 6 bits
+ return ((us[0] & 0xF) << 12) + ((us[1] & 0x3F) << 6) + (us[2] & 0x3F);
+ } else if (us[0] < 0xF5) { // four bytes: 3 + 6 + 6 + 6 bits
+ return ((us[0] & 0x7) << 18) + ((us[1] & 0x3F) << 12) + ((us[2] & 0x3F) << 6) + (us[3] & 0x3F);
+ }
+ return us[0]; // lead bytes 0xF5 and above: the byte itself is returned
+}
+
+// Move characterOffset whole characters from start and report the character there and its width in bytes. Return -1 on out-of-bounds
+int SCI_METHOD Document::GetRelativePosition(int start, int characterOffset, int *character, int *width) const {
+ int pos = start;
+ if (dbcsCodePage) {
+ const int increment = (characterOffset > 0) ? 1 : -1;
+ while (characterOffset != 0) {
+ const int posNext = NextPosition(pos, increment);
+ if (posNext == pos) // NextPosition made no progress, so the requested offset cannot be reached
+ return -1;
+ pos = posNext;
+ characterOffset -= increment;
+ }
+ const unsigned char leadByte = static_cast<unsigned char>(cb.CharAt(pos));
+ if (SC_CP_UTF8 == dbcsCodePage) {
+ if (UTF8IsAscii(leadByte)) {
+ // Single byte character or invalid
+ *character = leadByte;
+ *width = 1;
+ } else {
+ const int widthCharBytes = UTF8BytesOfLead[leadByte];
+ unsigned char charBytes[UTF8MaxBytes] = {leadByte,0,0,0};
+ for (int b=1; b<widthCharBytes; b++)
+ charBytes[b] = static_cast<unsigned char>(cb.CharAt(pos+b));
+ const int utf8status = UTF8Classify(charBytes, widthCharBytes);
+ if (utf8status & UTF8MaskInvalid) {
+ // Report as singleton surrogate values which are invalid in Unicode
+ *character = 0xDC80 + leadByte;
+ *width = 1;
+ } else {
+ *character = UnicodeFromBytes(charBytes);
+ *width = utf8status & UTF8MaskWidth;
+ }
+ }
+ } else { // dbcsCodePage is non-zero here and not UTF-8, so handle it as a DBCS code page
+ if (IsDBCSLeadByte(leadByte)) {
+ *character = (leadByte << 8) | static_cast<unsigned char>(cb.CharAt(pos+1));
+ *width = 2;
+ } else {
+ *character = leadByte;
+ *width = 1;
+ }
+ }
+ } else {
+ pos = start + characterOffset;
+ if ((pos < 0) || (pos > Length()))
+ return -1;
+ *character = static_cast<unsigned char>(cb.CharAt(pos)); // unsigned, matching the leadByte handling above, so bytes >= 0x80 are not negative
+ *width = 1;
+ }
+ return pos;
+}
+
int SCI_METHOD Document::CodePage() const {
return dbcsCodePage;
}
diff --git a/src/Document.h b/src/Document.h
index f3b49e1fe..8eb8db74a 100644
--- a/src/Document.h
+++ b/src/Document.h
@@ -279,6 +279,7 @@ public:
int MovePositionOutsideChar(int pos, int moveDir, bool checkLineEnd=true);
int NextPosition(int pos, int moveDir) const;
bool NextCharacter(int &pos, int moveDir) const; // Returns true if pos changed
+ int SCI_METHOD GetRelativePosition(int start, int characterOffset, int *character, int *width) const;
int SCI_METHOD CodePage() const;
bool SCI_METHOD IsDBCSLeadByte(char ch) const;
int SafeSegment(const char *text, int length, int lengthSegment) const;