diff options
-rw-r--r-- | doc/ScintillaDoc.html | 7 | ||||
-rw-r--r-- | include/ILexer.h | 1 | ||||
-rw-r--r-- | lexlib/LexAccessor.h | 15 | ||||
-rw-r--r-- | lexlib/StyleContext.h | 115 | ||||
-rw-r--r-- | src/Document.cxx | 65 | ||||
-rw-r--r-- | src/Document.h | 1 |
6 files changed, 147 insertions, 57 deletions
diff --git a/doc/ScintillaDoc.html b/doc/ScintillaDoc.html index 281bbf957..abec92a5b 100644 --- a/doc/ScintillaDoc.html +++ b/doc/ScintillaDoc.html @@ -6321,13 +6321,18 @@ exception options.</p> <p> To allow lexers to determine the end position of a line and thus more easily support Unicode line ends -<code>IDocument</code> is extended to <code>IDocumentWithLineEnd</code>. +<code>IDocument</code> is extended to <code>IDocumentWithLineEnd</code>.</p> +<p>The <code>GetRelativePosition</code> method allows navigating the document by whole characters and provides a standard +conversion from UTF-8 bytes to a UTF-32 character or from DBCS to a 16 bit value. +Invalid UTF-8 is reported as a character for each byte with values 0xDC80+byteValue, which are +not valid Unicode code points. </p> <div class="highlighted"> <span class="S5">class</span><span class="S0"> </span>IDocumentWithLineEnd<span class="S0"> </span><span class="S10">:</span><span class="S0"> </span><span class="S5">public</span><span class="S0"> </span>IDocument<span class="S0"> </span><span class="S10">{</span><br /> <span class="S5">public</span><span class="S10">:</span><br /> <span class="S0"> </span><span class="S5">virtual</span><span class="S0"> </span><span class="S5">int</span><span class="S0"> </span>SCI_METHOD<span class="S0"> </span>LineEnd<span class="S10">(</span><span class="S5">int</span><span class="S0"> </span>line<span class="S10">)</span><span class="S0"> </span><span class="S5">const</span><span class="S0"> </span><span class="S10">=</span><span class="S0"> </span><span class="S4">0</span><span class="S10">;</span><br /> +<span class="S0"> </span><span class="S5">virtual</span><span class="S0"> </span><span class="S5">int</span><span class="S0"> </span>SCI_METHOD<span class="S0"> </span>GetRelativePosition<span class="S10">(</span><span class="S5">int</span><span class="S0"> </span>start<span class="S10">,</span><span class="S0"> </span><span class="S5">int</span><span class="S0"> </span>characterOffset<span class="S10">,</span><span class="S0"> </span><span class="S5">int</span><span class="S0"> </span><span class="S10">*</span>character<span class="S10">,</span><span class="S0"> </span><span class="S5">int</span><span class="S0"> </span><span class="S10">*</span>width<span class="S10">)</span><span class="S0"> </span><span class="S5">const</span><span class="S0"> </span><span class="S10">=</span><span class="S0"> </span><span class="S4">0</span><span class="S10">;</span><br /> <span class="S10">};</span><br /> </div> diff --git a/include/ILexer.h b/include/ILexer.h index 1260c1373..9f9225ef2 100644 --- a/include/ILexer.h +++ b/include/ILexer.h @@ -48,6 +48,7 @@ public: class IDocumentWithLineEnd : public IDocument { public: virtual int SCI_METHOD LineEnd(int line) const = 0; + virtual int SCI_METHOD GetRelativePosition(int start, int characterOffset, int *character, int *width) const = 0; }; enum { lvOriginal=0, lvSubStyles=1 }; diff --git a/lexlib/LexAccessor.h b/lexlib/LexAccessor.h index 4223f302d..92e719360 100644 --- a/lexlib/LexAccessor.h +++ b/lexlib/LexAccessor.h @@ -126,6 +126,21 @@ public: return startNext - 1; } } + int GetRelativePosition(int start, int characterOffset, int *character, int *width) { + if (documentVersion >= dvLineEnd) { + return (static_cast<IDocumentWithLineEnd *>(pAccess))->GetRelativePosition( + start, characterOffset, character, width); + } else { + // Old version -> byte-oriented only + // Handle doc range overflow + int posNew = start + characterOffset; + if ((posNew < 0) || (posNew > Length())) + return -1; + *character = SafeGetCharAt(posNew, 0); + *width = 1; + return start + characterOffset; + } + } int LevelAt(int line) const { return pAccess->GetLevel(line); } diff --git a/lexlib/StyleContext.h b/lexlib/StyleContext.h index 2c010645b..0b5dee379 100644 --- a/lexlib/StyleContext.h +++ b/lexlib/StyleContext.h @@ -51,35 +51,27 @@ class StyleContext { LexAccessor &styler; unsigned int endPos; unsigned int lengthDocument; + + // Used for optimizing GetRelativeCharacter + unsigned int posRelative; + unsigned int currentPosLastRelative; + int offsetRelative; + StyleContext &operator=(const StyleContext &); - void GetNextChar(unsigned int pos) { - chNext = static_cast<unsigned char>(styler.SafeGetCharAt(pos+1, 0)); - if (styler.Encoding() == encUnicode) { - if (chNext >= 0x80) { - unsigned char bytes[4] = { static_cast<unsigned char>(chNext), 0, 0, 0 }; - for (int trail=1; trail<3; trail++) { - bytes[trail] = static_cast<unsigned char>(styler.SafeGetCharAt(pos+1+trail, 0)); - if (!((bytes[trail] >= 0x80) && (bytes[trail] < 0xc0))) { - bytes[trail] = 0; - break; - } - } - chNext = UnicodeCodePoint(bytes); - } - } else if (styler.Encoding() == encDBCS) { - if (styler.IsLeadByte(static_cast<char>(chNext))) { - chNext = chNext << 8; - chNext |= static_cast<unsigned char>(styler.SafeGetCharAt(pos+2, 0)); - } + void GetNextChar() { + if (styler.Encoding() == enc8bit) { + chNext = static_cast<unsigned char>(styler.SafeGetCharAt(currentPos+width, 0)); + widthNext = 1; + } else { + styler.GetRelativePosition(currentPos+width, 0, &chNext, &widthNext); } - // End of line? - // Trigger on CR only (Mac style) or either on LF from CR+LF (Dos/Win) - // or on LF alone (Unix). Avoid triggering two times on Dos/Win. + // End of line determined from line end position, allowing CR, LF, + // CRLF and Unicode line ends as set by document. if (currentLine < lineDocEnd) - atLineEnd = static_cast<int>(pos) >= (lineStartNext-1); + atLineEnd = static_cast<int>(currentPos) >= (lineStartNext-1); else // Last line - atLineEnd = static_cast<int>(pos) >= lineStartNext; + atLineEnd = static_cast<int>(currentPos) >= lineStartNext; } public: @@ -92,12 +84,17 @@ public: int state; int chPrev; int ch; + int width; int chNext; + int widthNext; StyleContext(unsigned int startPos, unsigned int length, int initStyle, LexAccessor &styler_, char chMask=31) : styler(styler_), endPos(startPos + length), + posRelative(0), + currentPosLastRelative(0x7FFFFFFF), + offsetRelative(0), currentPos(startPos), currentLine(-1), lineStartNext(-1), @@ -105,7 +102,9 @@ public: state(initStyle & chMask), // Mask off all bits which aren't in the chMask. chPrev(0), ch(0), - chNext(0) { + width(0), + chNext(0), + widthNext(1) { styler.StartAt(startPos, chMask); styler.StartSegment(startPos); currentLine = styler.GetLine(startPos); @@ -115,21 +114,14 @@ public: endPos++; lineDocEnd = styler.GetLine(lengthDocument); atLineStart = static_cast<unsigned int>(styler.LineStart(currentLine)) == startPos; - unsigned int pos = currentPos; - ch = static_cast<unsigned char>(styler.SafeGetCharAt(pos, 0)); - if (styler.Encoding() == encUnicode) { - // Get the current char - GetNextChar(pos-1); - ch = chNext; - pos += BytesInUnicodeCodePoint(ch) - 1; - } else if (styler.Encoding() == encDBCS) { - if (styler.IsLeadByte(static_cast<char>(ch))) { - pos++; - ch = ch << 8; - ch |= static_cast<unsigned char>(styler.SafeGetCharAt(pos, 0)); - } - } - GetNextChar(pos); + + // Variable width is now 0 so GetNextChar gets the char at currentPos into chNext/widthNext + width = 0; + GetNextChar(); + ch = chNext; + width = widthNext; + + GetNextChar(); } void Complete() { styler.ColourTo(currentPos - ((currentPos > lengthDocument) ? 2 : 1), state); @@ -146,23 +138,10 @@ public: lineStartNext = styler.LineStart(currentLine+1); } chPrev = ch; - if (styler.Encoding() == encUnicode) { - currentPos += BytesInUnicodeCodePoint(ch); - } else if (styler.Encoding() == encDBCS) { - currentPos++; - if (ch >= 0x100) - currentPos++; - } else { - currentPos++; - } + currentPos += width; ch = chNext; - if (styler.Encoding() == encUnicode) { - GetNextChar(currentPos + BytesInUnicodeCodePoint(ch)-1); - } else if (styler.Encoding() == encDBCS) { - GetNextChar(currentPos + ((ch >= 0x100) ? 1 : 0)); - } else { - GetNextChar(currentPos); - } + width = widthNext; + GetNextChar(); } else { atLineStart = false; chPrev = ' '; @@ -200,6 +179,30 @@ public: int GetRelative(int n) { return static_cast<unsigned char>(styler.SafeGetCharAt(currentPos+n, 0)); } + int GetRelativeCharacter(int n) { + if (n == 0) + return ch; + if (styler.Encoding() == enc8bit) { + // fast version for single byte encodings + return static_cast<unsigned char>(styler.SafeGetCharAt(currentPos + n, 0)); + } else { + int ch = 0; + int width = 0; + //styler.GetRelativePosition(currentPos, n, &ch, &width); + if ((currentPosLastRelative != currentPos) || + ((n > 0) && ((offsetRelative < 0) || (n < offsetRelative))) || + ((n < 0) && ((offsetRelative > 0) || (n > offsetRelative)))) { + posRelative = currentPos; + offsetRelative = 0; + } + int diffRelative = n - offsetRelative; + int posNew = styler.GetRelativePosition(posRelative, diffRelative, &ch, &width); + posRelative = posNew; + currentPosLastRelative = currentPos; + offsetRelative = n; + return ch; + } + } bool Match(char ch0) const { return ch == static_cast<unsigned char>(ch0); } diff --git a/src/Document.cxx b/src/Document.cxx index 8523a00fa..472567068 100644 --- a/src/Document.cxx +++ b/src/Document.cxx @@ -699,6 +699,71 @@ bool Document::NextCharacter(int &pos, int moveDir) const { } } +static inline int UnicodeFromBytes(const unsigned char *us) { + if (us[0] < 0xC2) { + return us[0]; + } else if (us[0] < 0xE0) { + return ((us[0] & 0x1F) << 6) + (us[1] & 0x3F); + } else if (us[0] < 0xF0) { + return ((us[0] & 0xF) << 12) + ((us[1] & 0x3F) << 6) + (us[2] & 0x3F); + } else if (us[0] < 0xF5) { + return ((us[0] & 0x7) << 18) + ((us[1] & 0x3F) << 12) + ((us[2] & 0x3F) << 6) + (us[3] & 0x3F); + } + return us[0]; +} + +// Return -1 on out-of-bounds +int SCI_METHOD Document::GetRelativePosition(int start, int characterOffset, int *character, int *width) const { + int pos = start; + if (dbcsCodePage) { + const int increment = (characterOffset > 0) ? 1 : -1; + while (characterOffset != 0) { + const int posNext = NextPosition(pos, increment); + if (posNext == pos) + return -1; + pos = posNext; + characterOffset -= increment; + } + const unsigned char leadByte = static_cast<unsigned char>(cb.CharAt(pos)); + if (SC_CP_UTF8 == dbcsCodePage) { + if (UTF8IsAscii(leadByte)) { + // Single byte character or invalid + *character = leadByte; + *width = 1; + } else { + const int widthCharBytes = UTF8BytesOfLead[leadByte]; + unsigned char charBytes[UTF8MaxBytes] = {leadByte,0,0,0}; + for (int b=1; b<widthCharBytes; b++) + charBytes[b] = static_cast<unsigned char>(cb.CharAt(pos+b)); + int utf8status = UTF8Classify(charBytes, widthCharBytes); + if (utf8status & UTF8MaskInvalid) { + // Report as singleton surrogate values which are invalid in Unicode + *character = 0xDC80 + leadByte; + *width = 1; + } else { + *character = UnicodeFromBytes(charBytes); + *width = utf8status & UTF8MaskWidth; + } + } + } else if (dbcsCodePage) { + if (IsDBCSLeadByte(leadByte)) { + *character = (leadByte << 8) | static_cast<unsigned char>(cb.CharAt(pos+1)); + *width = 2; + } else { + *character = leadByte; + *width = 1; + } + } + } else { + pos = start + characterOffset; + if ((pos < 0) || (pos > Length())) + return -1; + *character = cb.CharAt(pos); + *width = 1; + } + return pos; +} + int SCI_METHOD Document::CodePage() const { return dbcsCodePage; } diff --git a/src/Document.h b/src/Document.h index f3b49e1fe..8eb8db74a 100644 --- a/src/Document.h +++ b/src/Document.h @@ -279,6 +279,7 @@ public: int MovePositionOutsideChar(int pos, int moveDir, bool checkLineEnd=true); int NextPosition(int pos, int moveDir) const; bool NextCharacter(int &pos, int moveDir) const; // Returns true if pos changed + int SCI_METHOD GetRelativePosition(int start, int characterOffset, int *character, int *width) const; int SCI_METHOD CodePage() const; bool SCI_METHOD IsDBCSLeadByte(char ch) const; int SafeSegment(const char *text, int length, int lengthSegment) const; |