6 files changed, 147 insertions, 57 deletions
diff --git a/doc/ScintillaDoc.html b/doc/ScintillaDoc.html
index 281bbf957..abec92a5b 100644
--- a/doc/ScintillaDoc.html
+++ b/doc/ScintillaDoc.html
@@ -6321,13 +6321,18 @@ exception options.</p>
 
 <p>
 To allow lexers to determine the end position of a line and thus more easily support Unicode line ends
-<code>IDocument</code> is extended to <code>IDocumentWithLineEnd</code>.
+<code>IDocument</code> is extended to <code>IDocumentWithLineEnd</code>.</p>
+<p>The <code>GetRelativePosition</code> method allows navigating the document by whole characters and provides a standard
+conversion from UTF-8 bytes to a UTF-32 character or from DBCS to a 16-bit value.
+Each byte of invalid UTF-8 is reported as a separate character with the value 0xDC80+byteValue;
+these are unpaired surrogate values and so are not valid Unicode characters.
 </p>
 
 <div class="highlighted">
 <span class="S5">class</span><span class="S0"> </span>IDocumentWithLineEnd<span class="S0"> </span><span class="S10">:</span><span class="S0"> </span><span class="S5">public</span><span class="S0"> </span>IDocument<span class="S0"> </span><span class="S10">{</span><br />
 <span class="S5">public</span><span class="S10">:</span><br />
 <span class="S0">&nbsp; &nbsp; &nbsp; &nbsp; </span><span class="S5">virtual</span><span class="S0"> </span><span class="S5">int</span><span class="S0"> </span>SCI_METHOD<span class="S0"> </span>LineEnd<span class="S10">(</span><span class="S5">int</span><span class="S0"> </span>line<span class="S10">)</span><span class="S0"> </span><span class="S5">const</span><span class="S0"> </span><span class="S10">=</span><span class="S0"> </span><span class="S4">0</span><span class="S10">;</span><br />
+<span class="S0">&nbsp; &nbsp; &nbsp; &nbsp; </span><span class="S5">virtual</span><span class="S0"> </span><span class="S5">int</span><span class="S0"> </span>SCI_METHOD<span class="S0"> </span>GetRelativePosition<span class="S10">(</span><span class="S5">int</span><span class="S0"> </span>start<span class="S10">,</span><span class="S0"> </span><span class="S5">int</span><span class="S0"> </span>characterOffset<span class="S10">,</span><span class="S0"> </span><span class="S5">int</span><span class="S0"> </span><span class="S10">*</span>character<span class="S10">,</span><span class="S0"> </span><span class="S5">int</span><span class="S0"> </span><span class="S10">*</span>width<span class="S10">)</span><span class="S0"> </span><span class="S5">const</span><span class="S0"> </span><span class="S10">=</span><span class="S0"> </span><span class="S4">0</span><span class="S10">;</span><br />
 <span class="S10">};</span><br />
 </div>
 
diff --git a/include/ILexer.h b/include/ILexer.h
index 1260c1373..9f9225ef2 100644
--- a/include/ILexer.h
+++ b/include/ILexer.h
@@ -48,6 +48,7 @@ public:
 class IDocumentWithLineEnd : public IDocument {
 public:
 	virtual int SCI_METHOD LineEnd(int line) const = 0;
+	virtual int SCI_METHOD GetRelativePosition(int start, int characterOffset, int *character, int *width) const = 0;	// Position characterOffset characters from start, or -1; fills *character and *width
 };
 
 enum { lvOriginal=0, lvSubStyles=1 };
diff --git a/lexlib/LexAccessor.h b/lexlib/LexAccessor.h
index 4223f302d..92e719360 100644
--- a/lexlib/LexAccessor.h
+++ b/lexlib/LexAccessor.h
@@ -126,6 +126,21 @@ public:
 				return startNext - 1;
 		}
 	}
+	// Step characterOffset characters from start: fills *character/*width and returns the position, or -1 if out of range.
+	int GetRelativePosition(int start, int characterOffset, int *character, int *width) {
+		if (documentVersion >= dvLineEnd) {
+			return (static_cast<IDocumentWithLineEnd *>(pAccess))->GetRelativePosition(
+				start, characterOffset, character, width);
+		}
+		// Old version -> byte-oriented only, so each character is treated as one byte wide
+		const int posNew = start + characterOffset;
+		if ((posNew < 0) || (posNew > Length()))
+			return -1;
+		// Convert through unsigned char so bytes >= 0x80 are not sign extended into negative values
+		*character = static_cast<unsigned char>(SafeGetCharAt(posNew, 0));
+		*width = 1;
+		return posNew;
+	}
 	int LevelAt(int line) const {
 		return pAccess->GetLevel(line);
 	}
diff --git a/lexlib/StyleContext.h b/lexlib/StyleContext.h
index 2c010645b..0b5dee379 100644
--- a/lexlib/StyleContext.h
+++ b/lexlib/StyleContext.h
@@ -51,35 +51,27 @@ class StyleContext {
 	LexAccessor &styler;
 	unsigned int endPos;
 	unsigned int lengthDocument;
+
+	// Cache for GetRelativeCharacter so successive look-ups can continue from the last position found
+	unsigned int posRelative;
+	unsigned int currentPosLastRelative;
+	int offsetRelative;
+
 	StyleContext &operator=(const StyleContext &);
 
-	void GetNextChar(unsigned int pos) {
-		chNext = static_cast<unsigned char>(styler.SafeGetCharAt(pos+1, 0));
-		if (styler.Encoding() == encUnicode) {
-			if (chNext >= 0x80) {
-				unsigned char bytes[4] = { static_cast<unsigned char>(chNext), 0, 0, 0 };
-				for (int trail=1; trail<3; trail++) {
-					bytes[trail] = static_cast<unsigned char>(styler.SafeGetCharAt(pos+1+trail, 0));
-					if (!((bytes[trail] >= 0x80) && (bytes[trail] < 0xc0))) {
-						bytes[trail] = 0;
-						break;
-					}
-				}
-				chNext = UnicodeCodePoint(bytes);
-			}
-		} else if (styler.Encoding() == encDBCS) {
-			if (styler.IsLeadByte(static_cast<char>(chNext))) {
-				chNext = chNext << 8;
-				chNext |= static_cast<unsigned char>(styler.SafeGetCharAt(pos+2, 0));
-			}
+	void GetNextChar() {	// Fetch the character after the current one into chNext/widthNext and update atLineEnd
+		if (styler.Encoding() == enc8bit) {
+			chNext = static_cast<unsigned char>(styler.SafeGetCharAt(currentPos+width, 0));
+			widthNext = 1;
+		} else {
+			styler.GetRelativePosition(currentPos+width, 0, &chNext, &widthNext);	// offset 0: read the character that starts at currentPos+width
 		}
-		// End of line?
-		// Trigger on CR only (Mac style) or either on LF from CR+LF (Dos/Win)
-		// or on LF alone (Unix). Avoid triggering two times on Dos/Win.
+		// End of line determined from line end position, allowing CR, LF,
+		// CRLF and Unicode line ends as set by document.
 		if (currentLine < lineDocEnd)
-			atLineEnd = static_cast<int>(pos) >= (lineStartNext-1);
+			atLineEnd = static_cast<int>(currentPos) >= (lineStartNext-1);
 		else // Last line
-			atLineEnd = static_cast<int>(pos) >= lineStartNext;
+			atLineEnd = static_cast<int>(currentPos) >= lineStartNext;
 	}
 
 public:
@@ -92,12 +84,17 @@ public:
 	int state;
 	int chPrev;
 	int ch;
+	int width;
 	int chNext;
+	int widthNext;
 
 	StyleContext(unsigned int startPos, unsigned int length,
                         int initStyle, LexAccessor &styler_, char chMask=31) :
 		styler(styler_),
 		endPos(startPos + length),
+		posRelative(0),
+		currentPosLastRelative(0x7FFFFFFF),
+		offsetRelative(0),
 		currentPos(startPos),
 		currentLine(-1),
 		lineStartNext(-1),
@@ -105,7 +102,9 @@ public:
 		state(initStyle & chMask), // Mask off all bits which aren't in the chMask.
 		chPrev(0),
 		ch(0),
-		chNext(0) {
+		width(0),
+		chNext(0),
+		widthNext(1) {
 		styler.StartAt(startPos, chMask);
 		styler.StartSegment(startPos);
 		currentLine = styler.GetLine(startPos);
@@ -115,21 +114,14 @@ public:
 			endPos++;
 		lineDocEnd = styler.GetLine(lengthDocument);
 		atLineStart = static_cast<unsigned int>(styler.LineStart(currentLine)) == startPos;
-		unsigned int pos = currentPos;
-		ch = static_cast<unsigned char>(styler.SafeGetCharAt(pos, 0));
-		if (styler.Encoding() == encUnicode) {
-			// Get the current char
-			GetNextChar(pos-1);
-			ch = chNext;
-			pos += BytesInUnicodeCodePoint(ch) - 1;
-		} else if (styler.Encoding() == encDBCS) {
-			if (styler.IsLeadByte(static_cast<char>(ch))) {
-				pos++;
-				ch = ch << 8;
-				ch |= static_cast<unsigned char>(styler.SafeGetCharAt(pos, 0));
-			}
-		}
-		GetNextChar(pos);
+
+		// Variable width is now 0 so GetNextChar gets the char at currentPos into chNext/widthNext
+		width = 0;
+		GetNextChar();
+		ch = chNext;
+		width = widthNext;
+
+		GetNextChar();
 	}
 	void Complete() {
 		styler.ColourTo(currentPos - ((currentPos > lengthDocument) ? 2 : 1), state);
@@ -146,23 +138,10 @@ public:
 				lineStartNext = styler.LineStart(currentLine+1);
 			}
 			chPrev = ch;
-			if (styler.Encoding() == encUnicode) {
-				currentPos += BytesInUnicodeCodePoint(ch);
-			} else if (styler.Encoding() == encDBCS) {
-				currentPos++;
-				if (ch >= 0x100)
-					currentPos++;
-			} else {
-				currentPos++;
-			}
+			currentPos += width;
 			ch = chNext;
-			if (styler.Encoding() == encUnicode) {
-				GetNextChar(currentPos + BytesInUnicodeCodePoint(ch)-1);
-			} else if (styler.Encoding() == encDBCS) {
-				GetNextChar(currentPos + ((ch >= 0x100) ? 1 : 0));
-			} else {
-				GetNextChar(currentPos);
-			}
+			width = widthNext;
+			GetNextChar();
 		} else {
 			atLineStart = false;
 			chPrev = ' ';
@@ -200,6 +179,30 @@ public:
 	int GetRelative(int n) {
 		return static_cast<unsigned char>(styler.SafeGetCharAt(currentPos+n, 0));
 	}
+	// Return the character n whole characters away from the current one (n may be negative).
+	int GetRelativeCharacter(int n) {
+		if (n == 0)
+			return ch;
+		if (styler.Encoding() == enc8bit)	// fast version for single byte encodings
+			return static_cast<unsigned char>(styler.SafeGetCharAt(currentPos + n, 0));
+		// Continue from the position cached by the previous call only when currentPos is unchanged
+		// and n lies at or beyond the cached offset in the same direction; otherwise restart.
+		if ((currentPosLastRelative != currentPos) ||
+			((n > 0) && ((offsetRelative < 0) || (n < offsetRelative))) ||
+			((n < 0) && ((offsetRelative > 0) || (n > offsetRelative)))) {
+			posRelative = currentPos;
+			offsetRelative = 0;
+		}
+		int chRelative = 0;
+		int widthRelative = 0;
+		const int posNew = styler.GetRelativePosition(posRelative, n - offsetRelative, &chRelative, &widthRelative);
+		if (posNew < 0)
+			return 0;	// Out of range: do not cache the -1 failure value as a position
+		posRelative = posNew;
+		currentPosLastRelative = currentPos;
+		offsetRelative = n;
+		return chRelative;
+	}
 	bool Match(char ch0) const {
 		return ch == static_cast<unsigned char>(ch0);
 	}
diff --git a/src/Document.cxx b/src/Document.cxx
index 8523a00fa..472567068 100644
--- a/src/Document.cxx
+++ b/src/Document.cxx
@@ -699,6 +699,71 @@ bool Document::NextCharacter(int &pos, int moveDir) const {
 	}
 }
 
+// Combine the UTF-8 bytes at us into one value, sized by the lead byte; trail bytes are not validated here.
+static inline int UnicodeFromBytes(const unsigned char *us) {
+	const int lead = us[0];
+	if ((lead >= 0xC2) && (lead < 0xE0))	// lead of a 2 byte sequence
+		return ((lead & 0x1F) << 6) + (us[1] & 0x3F);
+	if ((lead >= 0xE0) && (lead < 0xF0))	// lead of a 3 byte sequence
+		return ((lead & 0xF) << 12) + ((us[1] & 0x3F) << 6) + (us[2] & 0x3F);
+	if ((lead >= 0xF0) && (lead < 0xF5))	// lead of a 4 byte sequence
+		return ((lead & 0x7) << 18) + ((us[1] & 0x3F) << 12) + ((us[2] & 0x3F) << 6) + (us[3] & 0x3F);
+	// Below 0xC2 or from 0xF5 up: no multi-byte sequence starts here, so return the byte itself
+	return lead;
+}
+
+// Step characterOffset whole characters from start and report the character found there
+// and its width in bytes. Return the position reached or -1 on out-of-bounds.
+int SCI_METHOD Document::GetRelativePosition(int start, int characterOffset, int *character, int *width) const {
+	int pos = start;
+	if (dbcsCodePage) {
+		const int increment = (characterOffset > 0) ? 1 : -1;
+		while (characterOffset != 0) {
+			const int posNext = NextPosition(pos, increment);
+			if (posNext == pos)
+				return -1;
+			pos = posNext;
+			characterOffset -= increment;
+		}
+		const unsigned char leadByte = static_cast<unsigned char>(cb.CharAt(pos));
+		if (SC_CP_UTF8 == dbcsCodePage) {
+			if (UTF8IsAscii(leadByte)) {
+				*character = leadByte;	// Single byte character
+				*width = 1;
+			} else {
+				const int widthCharBytes = UTF8BytesOfLead[leadByte];
+				unsigned char charBytes[UTF8MaxBytes] = {leadByte,0,0,0};
+				for (int b=1; b<widthCharBytes; b++)
+					charBytes[b] = static_cast<unsigned char>(cb.CharAt(pos+b));
+				const int utf8status = UTF8Classify(charBytes, widthCharBytes);
+				if (utf8status & UTF8MaskInvalid) {
+					// Report as singleton surrogate values which are invalid in Unicode
+					*character = 0xDC80 + leadByte;
+					*width = 1;
+				} else {
+					*character = UnicodeFromBytes(charBytes);
+					*width = utf8status & UTF8MaskWidth;
+				}
+			}
+		} else {	// Double byte code page other than UTF-8
+			if (IsDBCSLeadByte(leadByte)) {
+				*character = (leadByte << 8) | static_cast<unsigned char>(cb.CharAt(pos+1));
+				*width = 2;
+			} else {
+				*character = leadByte;
+				*width = 1;
+			}
+		}
+	} else {
+		pos = start + characterOffset;
+		if ((pos < 0) || (pos > Length()))
+			return -1;
+		*character = static_cast<unsigned char>(cb.CharAt(pos));	// unsigned so bytes >= 0x80 are not negative
+		*width = 1;
+	}
+	return pos;
+}
+
 int SCI_METHOD Document::CodePage() const {
 	return dbcsCodePage;
 }
diff --git a/src/Document.h b/src/Document.h
index f3b49e1fe..8eb8db74a 100644
--- a/src/Document.h
+++ b/src/Document.h
@@ -279,6 +279,7 @@ public:
 	int MovePositionOutsideChar(int pos, int moveDir, bool checkLineEnd=true);
 	int NextPosition(int pos, int moveDir) const;
 	bool NextCharacter(int &pos, int moveDir) const;	// Returns true if pos changed
+	int SCI_METHOD GetRelativePosition(int start, int characterOffset, int *character, int *width) const;
 	int SCI_METHOD CodePage() const;
 	bool SCI_METHOD IsDBCSLeadByte(char ch) const;
 	int SafeSegment(const char *text, int length, int lengthSegment) const;