5 files changed, 87 insertions, 50 deletions
diff --git a/src/Document.cxx b/src/Document.cxx
index 08bc24ecf..7b718f272 100644
--- a/src/Document.cxx
+++ b/src/Document.cxx
@@ -713,6 +713,55 @@ bool SCI_METHOD Document::IsDBCSLeadByte(char ch) const {
 	return false;
 }
 
+inline bool IsSpaceOrTab(int ch) {
+	return (ch == '\t') || (ch == ' ');	// space or tab only; line ends do not count
+}
+
+// Choose where to end a segment of text that is at most lengthSegment bytes long,
+// taking account of the encoding to not break inside a UTF-8 or DBCS character
+// and also trying to avoid breaking inside a pair of combining characters.
+// lengthSegment must always be long enough (more than 4 bytes)
+// so that there will be at least one whole character to make a segment.
+// For UTF-8, text should consist only of valid whole characters.
+// Returns length when the text already fits in one segment, otherwise the
+// byte offset of a break chosen in preference order from best to worst:
+//   1) Break after space or tab
+//   2) Break before punctuation (a character whose first byte is below 'A')
+//   3) Break after whole character
+int Document::SafeSegment(const char *text, int length, int lengthSegment) {
+	if (length <= lengthSegment)
+		return length;
+	int lastSpaceBreak = -1;
+	int lastPunctuationBreak = -1;
+	int lastEncodingAllowedBreak = -1;
+	for (int j=0; j < lengthSegment;) {
+		unsigned char ch = static_cast<unsigned char>(text[j]);
+		if (j > 0) {
+			if (IsSpaceOrTab(text[j - 1]) && !IsSpaceOrTab(text[j])) {
+				lastSpaceBreak = j;
+			}
+			if (ch < 'A') {
+				lastPunctuationBreak = j;
+			}
+		}
+		lastEncodingAllowedBreak = j;
+		int bytesInChar = 1;
+		if (dbcsCodePage == SC_CP_UTF8) {
+			bytesInChar = (ch < 0x80) ? 1 : BytesFromLead(ch);
+		} else if (dbcsCodePage && IsDBCSLeadByte(ch)) {
+			bytesInChar = 2;
+		}
+		// Always advance, in case BytesFromLead reports no length for an invalid lead byte.
+		j += (bytesInChar > 0) ? bytesInChar : 1;
+	}
+	if (lastSpaceBreak >= 0) {
+		return lastSpaceBreak;
+	} else if (lastPunctuationBreak >= 0) {
+		return lastPunctuationBreak;
+	}
+	return lastEncodingAllowedBreak;
+}
+
 void Document::ModifiedAt(int pos) {
 	if (endStyled > pos)
 		endStyled = pos;
diff --git a/src/Document.h b/src/Document.h
index 274aa0baa..7858db727 100644
--- a/src/Document.h
+++ b/src/Document.h
@@ -275,6 +275,7 @@ public:
 	bool NextCharacter(int &pos, int moveDir);	// Returns true if pos changed
 	int SCI_METHOD CodePage() const;
 	bool SCI_METHOD IsDBCSLeadByte(char ch) const;
+	int SafeSegment(const char *text, int length, int lengthSegment);
 
 	// Gateways to modifying document
 	void ModifiedAt(int pos);
diff --git a/src/Editor.cxx b/src/Editor.cxx
index 03c7b1103..ae2d670ce 100644
--- a/src/Editor.cxx
+++ b/src/Editor.cxx
@@ -2185,7 +2185,7 @@ void Editor::LayoutLine(int line, Surface *surface, ViewStyle &vstyle, LineLayou
 						} else {
 							lastSegItalics = vstyle.styles[ll->styles[charInLine]].italic;
 							posCache.MeasureWidths(surface, vstyle, ll->styles[charInLine], ll->chars + startseg,
-							        lenSeg, ll->positions + startseg + 1);
+							        lenSeg, ll->positions + startseg + 1, pdoc);
 						}
 					}
 				} else {    // invisible
@@ -2801,7 +2801,7 @@ void Editor::DrawLine(Surface *surface, ViewStyle &vsDraw, int line, int lineVis
 
 	ll->psel = &sel;
 
-	BreakFinder bfBack(ll, lineStart, lineEnd, posLineStart, IsUnicodeMode(), xStartVisible, selBackDrawn);
+	BreakFinder bfBack(ll, lineStart, lineEnd, posLineStart, xStartVisible, selBackDrawn, pdoc);
 	int next = bfBack.First();
 
 	// Background drawing loop
@@ -2891,8 +2891,8 @@ void Editor::DrawLine(Surface *surface, ViewStyle &vsDraw, int line, int lineVis
 
 	inIndentation = subLine == 0;	// Do not handle indentation except on first subline.
 	// Foreground drawing loop
-	BreakFinder bfFore(ll, lineStart, lineEnd, posLineStart, IsUnicodeMode(), xStartVisible,
-		((!twoPhaseDraw && selBackDrawn) || vsDraw.selforeset));
+	BreakFinder bfFore(ll, lineStart, lineEnd, posLineStart, xStartVisible,
+		((!twoPhaseDraw && selBackDrawn) || vsDraw.selforeset), pdoc);
 	next = bfFore.First();
 
 	while (next < lineEnd) {
diff --git a/src/PositionCache.cxx b/src/PositionCache.cxx
index 52c4326c0..e59c12630 100644
--- a/src/PositionCache.cxx
+++ b/src/PositionCache.cxx
@@ -391,18 +391,19 @@ static int NextBadU(const char *s, int p, int len, int &trailBytes) {
 	return -1;
 }
 
-BreakFinder::BreakFinder(LineLayout *ll_, int lineStart_, int lineEnd_, int posLineStart_, bool utf8_, int xStart, bool breakForSelection) :
+BreakFinder::BreakFinder(LineLayout *ll_, int lineStart_, int lineEnd_, int posLineStart_,
+	int xStart, bool breakForSelection, Document *pdoc_) :
 	ll(ll_),
 	lineStart(lineStart_),
 	lineEnd(lineEnd_),
 	posLineStart(posLineStart_),
-	utf8(utf8_),
 	nextBreak(lineStart_),
 	saeSize(0),
 	saeLen(0),
 	saeCurrentPos(0),
 	saeNext(0),
-	subBreak(-1) {
+	subBreak(-1),
+	pdoc(pdoc_) {
 	saeSize = 8;
 	selAndEdge = new int[saeSize];
 	for (unsigned int j=0; j < saeSize; j++) {
@@ -435,7 +436,7 @@ BreakFinder::BreakFinder(LineLayout *ll_, int lineStart_, int lineEnd_, int posL
 	Insert(ll->edgeColumn - 1);
 	Insert(lineEnd - 1);
 
-	if (utf8) {
+	if (pdoc && (SC_CP_UTF8 == pdoc->dbcsCodePage)) {
 		int trailBytes=0;
 		for (int pos = -1;;) {
 			pos = NextBadU(ll->chars, pos, lineEnd, trailBytes);
@@ -456,10 +457,6 @@ int BreakFinder::First() const {
 	return nextBreak;
 }
 
-static bool IsTrailByte(int ch) {
-	return (ch >= 0x80) && (ch < (0x80 + 0x40));
-}
-
 int BreakFinder::Next() {
 	if (subBreak == -1) {
 		int prev = nextBreak;
@@ -490,34 +487,7 @@ int BreakFinder::Next() {
 		subBreak = -1;
 		return nextBreak;
 	} else {
-		int lastGoodBreak = -1;
-		int lastOKBreak = -1;
-		int lastUTF8Break = -1;
-		int j;
-		for (j = subBreak + 1; j <= nextBreak; j++) {
-			if (IsSpaceOrTab(ll->chars[j - 1]) && !IsSpaceOrTab(ll->chars[j])) {
-				lastGoodBreak = j;
-			}
-			if (static_cast<unsigned char>(ll->chars[j]) < 'A') {
-				lastOKBreak = j;
-			}
-			if (utf8 && !IsTrailByte(static_cast<unsigned char>(ll->chars[j]))) {
-				lastUTF8Break = j;
-			}
-			if (((j - subBreak) >= lengthEachSubdivision) &&
-				((lastGoodBreak >= 0) || (lastOKBreak >= 0) || (lastUTF8Break >= 0))) {
-				break;
-			}
-		}
-		if (lastGoodBreak >= 0) {
-			subBreak = lastGoodBreak;
-		} else if (lastOKBreak >= 0) {
-			subBreak = lastOKBreak;
-		} else if (lastUTF8Break >= 0) {
-			subBreak = lastUTF8Break;
-		} else {
-			subBreak = nextBreak;
-		}
+		subBreak += pdoc->SafeSegment(ll->chars + subBreak, nextBreak-subBreak, lengthEachSubdivision);
 		if (subBreak >= nextBreak) {
 			subBreak = -1;
 			return nextBreak;
@@ -624,7 +594,8 @@ void PositionCache::SetSize(size_t size_) {
 }
 
 void PositionCache::MeasureWidths(Surface *surface, ViewStyle &vstyle, unsigned int styleNumber,
-	const char *s, unsigned int len, int *positions) {
+	const char *s, unsigned int len, int *positions, Document *pdoc) {
+
 	allClear = false;
 	int probe = -1;
 	if ((size > 0) && (len < 30)) {
@@ -646,7 +617,22 @@ void PositionCache::MeasureWidths(Surface *surface, ViewStyle &vstyle, unsigned
 			probe = probe2;
 		}
 	}
-	surface->MeasureWidths(vstyle.styles[styleNumber].font, s, len, positions);
+	if (len > BreakFinder::lengthStartSubdivision) {
+		// Break up into segments
+		unsigned int startSegment = 0;
+		int xStartSegment = 0;
+		while (startSegment < len) {
+			unsigned int lenSegment = pdoc->SafeSegment(s + startSegment, len - startSegment, BreakFinder::lengthEachSubdivision);
+			surface->MeasureWidths(vstyle.styles[styleNumber].font, s + startSegment, lenSegment, positions + startSegment);
+			for (unsigned int inSeg = 0; inSeg < lenSegment; inSeg++) {
+				positions[startSegment + inSeg] += xStartSegment;
+			}
+			xStartSegment = positions[startSegment + lenSegment - 1];
+			startSegment += lenSegment;
+		}
+	} else {
+		surface->MeasureWidths(vstyle.styles[styleNumber].font, s, len, positions);
+	}
 	if (probe >= 0) {
 		clock++;
 		if (clock > 60000) {
diff --git a/src/PositionCache.h b/src/PositionCache.h
index a76da574c..8bd4f1b43 100644
--- a/src/PositionCache.h
+++ b/src/PositionCache.h
@@ -117,16 +117,10 @@ public:
 
 // Class to break a line of text into shorter runs at sensible places.
 class BreakFinder {
-	// If a whole run is longer than lengthStartSubdivision then subdivide
-	// into smaller runs at spaces or punctuation.
-	enum { lengthStartSubdivision = 300 };
-	// Try to make each subdivided run lengthEachSubdivision or shorter.
-	enum { lengthEachSubdivision = 100 };
 	LineLayout *ll;
 	int lineStart;
 	int lineEnd;
 	int posLineStart;
-	bool utf8;
 	int nextBreak;
 	int *selAndEdge;
 	unsigned int saeSize;
@@ -134,9 +128,16 @@ class BreakFinder {
 	unsigned int saeCurrentPos;
 	int saeNext;
 	int subBreak;
+	Document *pdoc;
 	void Insert(int val);
 public:
-	BreakFinder(LineLayout *ll_, int lineStart_, int lineEnd_, int posLineStart_, bool utf8_, int xStart, bool breakForSelection);
+	// If a whole run is longer than lengthStartSubdivision then subdivide
+	// into smaller runs at spaces or punctuation.
+	enum { lengthStartSubdivision = 300 };
+	// Try to make each subdivided run lengthEachSubdivision or shorter.
+	enum { lengthEachSubdivision = 100 };
+	BreakFinder(LineLayout *ll_, int lineStart_, int lineEnd_, int posLineStart_, 
+		int xStart, bool breakForSelection, Document *pdoc_);
 	~BreakFinder();
 	int First() const;
 	int Next();
@@ -154,7 +155,7 @@ public:
 	void SetSize(size_t size_);
 	int GetSize() const { return size; }
 	void MeasureWidths(Surface *surface, ViewStyle &vstyle, unsigned int styleNumber,
-		const char *s, unsigned int len, int *positions);
+		const char *s, unsigned int len, int *positions, Document *pdoc);
 };
 
 inline bool IsSpaceOrTab(int ch) {