From d6ac5bf56d40512ac0634d7a5bee6f7328b7d41f Mon Sep 17 00:00:00 2001
From: nyamatongwe <devnull@localhost>
Date: Sat, 19 Jan 2013 11:40:47 +1100
Subject: Support the three Unicode line ends NEL, LS, and PS in CellBuffer,
 Document, Editor and the message interface. Will only be turned on for lexers
 that support Unicode line ends.

---
 src/CellBuffer.cxx  | 115 ++++++++++++++++++++++++++++++++++++++++++++++++----
 src/CellBuffer.h    |   5 +++
 src/Document.cxx    |  34 +++++++++++++++-
 src/Document.h      |   4 ++
 src/Editor.cxx      |  18 ++++++++
 src/UniConversion.h |  13 ++++++
 6 files changed, 181 insertions(+), 8 deletions(-)

(limited to 'src')
diff --git a/src/CellBuffer.cxx b/src/CellBuffer.cxx
index 11b8b4acd..7bb96ca76 100644
--- a/src/CellBuffer.cxx
+++ b/src/CellBuffer.cxx
@@ -16,6 +16,7 @@
 #include "SplitVector.h"
 #include "Partitioning.h"
 #include "CellBuffer.h"
+#include "UniConversion.h"
 
 #ifdef SCI_NAMESPACE
 using namespace Scintilla;
@@ -331,6 +332,7 @@ void UndoHistory::CompletedRedoStep() {
 
 CellBuffer::CellBuffer() {
 	readOnly = false;
+	utf8LineEnds = 0;
 	collectingUndo = true;
 }
 
@@ -458,6 +460,13 @@ void CellBuffer::Allocate(int newSize) {
 	style.ReAllocate(newSize);
 }
 
+void CellBuffer::SetLineEndTypes(int utf8LineEnds_) {
+	if (utf8LineEnds == utf8LineEnds_)
+		return;	// No change so the existing line data is still correct
+	utf8LineEnds = utf8LineEnds_;
+	ResetLineEnds();	// Line starts must be recalculated for the new setting
+}
+
 void CellBuffer::SetPerLine(PerLine *pl) {
 	lv.SetPerLine(pl);
 }
@@ -501,11 +510,64 @@ void CellBuffer::RemoveLine(int line) {
 	lv.RemoveLine(line);
 }
 
+bool CellBuffer::UTF8LineEndOverlaps(int position) const {
+	// Gather the two bytes before position and the two bytes starting at it
+	unsigned char window[4];
+	for (int k = 0; k < 4; k++) {
+		window[k] = static_cast<unsigned char>(substance.ValueAt(position - 2 + k));
+	}
+	// True when a 3 byte separator or 2 byte NEL starts before position and reaches it
+	return UTF8IsSeparator(window) || UTF8IsSeparator(window+1) || UTF8IsNEL(window+1);
+}
+
+void CellBuffer::ResetLineEnds() {
+	// Rebuild every line start by rescanning the buffer -- too much work to preserve existing line data
+	lv.Init();
+	// The scan keeps the previous two bytes so multi-byte UTF-8 line ends can be recognised
+	int position = 0;	// Never changed here: all offsets below are from the start of the buffer
+	int length = Length();
+	int lineInsert = 1;	// Index that the next discovered line will be inserted at
+	bool atLineStart = true;
+	lv.InsertText(lineInsert-1, length);	// NOTE(review): presumably makes the first line span the whole text -- confirm in LineVector
+	unsigned char chBeforePrev = 0;
+	unsigned char chPrev = 0;
+	unsigned char ch = ' ';
+	for (int i = 0; i < length; i++) {
+		ch = substance.ValueAt(position + i);
+		if (ch == '\r') {
+			InsertLine(lineInsert, (position + i) + 1, atLineStart);	// New line starts just after the CR
+			lineInsert++;
+		} else if (ch == '\n') {
+			if (chPrev == '\r') {
+				// CR LF pair: move the line start that was created for the CR to after the LF
+				lv.SetLineStart(lineInsert - 1, (position + i) + 1);
+			} else {
+				InsertLine(lineInsert, (position + i) + 1, atLineStart);
+				lineInsert++;
+			}
+		} else if (utf8LineEnds) {
+			unsigned char back3[3] = {chBeforePrev, chPrev, ch};	// ch may be the final byte of LS/PS (3 bytes) or NEL (2 bytes)
+			if (UTF8IsSeparator(back3) || UTF8IsNEL(back3+1)) {
+				InsertLine(lineInsert, (position + i) + 1, atLineStart);
+				lineInsert++;
+			}
+		}
+		chBeforePrev = chPrev;
+		chPrev = ch;
+	}
+}
+
 void CellBuffer::BasicInsertString(int position, const char *s, int insertLength) {
 	if (insertLength == 0)
 		return;
 	PLATFORM_ASSERT(insertLength > 0);
 
+	unsigned char chAfter = substance.ValueAt(position);
+	bool breakingUTF8LineEnd = false;
+	if (utf8LineEnds && UTF8IsTrailByte(chAfter)) {
+		breakingUTF8LineEnd = UTF8LineEndOverlaps(position);
+	}
+
 	substance.InsertFromArray(position, s, 0, insertLength);
 	style.InsertValue(position, insertLength, 0);
 
@@ -513,14 +575,17 @@ void CellBuffer::BasicInsertString(int position, const char *s, int insertLength
 	bool atLineStart = lv.LineStart(lineInsert-1) == position;
 	// Point all the lines after the insertion point further along in the buffer
 	lv.InsertText(lineInsert-1, insertLength);
-	char chPrev = substance.ValueAt(position - 1);
-	char chAfter = substance.ValueAt(position + insertLength);
+	unsigned char chBeforePrev = substance.ValueAt(position - 2);
+	unsigned char chPrev = substance.ValueAt(position - 1);
 	if (chPrev == '\r' && chAfter == '\n') {
 		// Splitting up a crlf pair at position
 		InsertLine(lineInsert, position, false);
 		lineInsert++;
 	}
-	char ch = ' ';
+	if (breakingUTF8LineEnd) {
+		RemoveLine(lineInsert);
+	}
+	unsigned char ch = ' ';
 	for (int i = 0; i < insertLength; i++) {
 		ch = s[i];
 		if (ch == '\r') {
@@ -534,7 +599,14 @@ void CellBuffer::BasicInsertString(int position, const char *s, int insertLength
 				InsertLine(lineInsert, (position + i) + 1, atLineStart);
 				lineInsert++;
 			}
+		} else if (utf8LineEnds) {
+			unsigned char back3[3] = {chBeforePrev, chPrev, ch};
+			if (UTF8IsSeparator(back3) || UTF8IsNEL(back3+1)) {
+				InsertLine(lineInsert, (position + i) + 1, atLineStart);
+				lineInsert++;
+			}
 		}
+		chBeforePrev = chPrev;
 		chPrev = ch;
 	}
 	// Joining two lines where last insertion is cr and following substance starts with lf
@@ -543,6 +615,22 @@ void CellBuffer::BasicInsertString(int position, const char *s, int insertLength
 			// End of line already in buffer so drop the newly created one
 			RemoveLine(lineInsert - 1);
 		}
+	} else if (utf8LineEnds && !UTF8IsAscii(chAfter)) {
+		// May have end of UTF-8 line end in buffer and start in insertion
+		for (int j = 0; j < UTF8SeparatorLength-1; j++) {
+			unsigned char chAt = substance.ValueAt(position + insertLength + j);
+			unsigned char back3[3] = {chBeforePrev, chPrev, chAt};
+			if (UTF8IsSeparator(back3)) {
+				InsertLine(lineInsert, (position + insertLength + j) + 1, atLineStart);
+				lineInsert++;
+			}
+			if ((j == 0) && UTF8IsNEL(back3+1)) {
+				InsertLine(lineInsert, (position + insertLength + j) + 1, atLineStart);
+				lineInsert++;
+			}
+			chBeforePrev = chPrev;
+			chPrev = chAt;
+		}
 	}
 }
 
@@ -560,9 +648,9 @@ void CellBuffer::BasicDeleteChars(int position, int deleteLength) {
 
 		int lineRemove = lv.LineFromPosition(position) + 1;
 		lv.InsertText(lineRemove-1, - (deleteLength));
-		char chPrev = substance.ValueAt(position - 1);
-		char chBefore = chPrev;
-		char chNext = substance.ValueAt(position);
+		unsigned char chPrev = substance.ValueAt(position - 1);
+		unsigned char chBefore = chPrev;
+		unsigned char chNext = substance.ValueAt(position);
 		bool ignoreNL = false;
 		if (chPrev == '\r' && chNext == '\n') {
 			// Move back one
@@ -570,8 +658,13 @@ void CellBuffer::BasicDeleteChars(int position, int deleteLength) {
 			lineRemove++;
 			ignoreNL = true; 	// First \n is not real deletion
 		}
+		if (utf8LineEnds && UTF8IsTrailByte(chNext)) {
+			if (UTF8LineEndOverlaps(position)) {
+				RemoveLine(lineRemove);
+			}
+		}
 
-		char ch = chNext;
+		unsigned char ch = chNext;
 		for (int i = 0; i < deleteLength; i++) {
 			chNext = substance.ValueAt(position + i + 1);
 			if (ch == '\r') {
@@ -584,6 +677,14 @@ void CellBuffer::BasicDeleteChars(int position, int deleteLength) {
 				} else {
 					RemoveLine(lineRemove);
 				}
+			} else if (utf8LineEnds) {
+				if (!UTF8IsAscii(ch)) {
+					unsigned char next3[3] = {ch, chNext,
+						static_cast<unsigned char>(substance.ValueAt(position + i + 2))};
+					if (UTF8IsSeparator(next3) || UTF8IsNEL(next3)) {
+						RemoveLine(lineRemove);
+					}
+				}
 			}
 
 			ch = chNext;
diff --git a/src/CellBuffer.h b/src/CellBuffer.h
index 388b9027b..bfbb121de 100644
--- a/src/CellBuffer.h
+++ b/src/CellBuffer.h
@@ -136,12 +136,15 @@ private:
 	SplitVector<char> substance;
 	SplitVector<char> style;
 	bool readOnly;
+	int utf8LineEnds;
 
 	bool collectingUndo;
 	UndoHistory uh;
 
 	LineVector lv;
 
+	bool UTF8LineEndOverlaps(int position) const;
+	void ResetLineEnds();
 	/// Actions without undo
 	void BasicInsertString(int position, const char *s, int insertLength);
 	void BasicDeleteChars(int position, int deleteLength);
@@ -162,6 +165,8 @@ public:
 
 	int Length() const;
 	void Allocate(int newSize);
+	int GetLineEndTypes() const { return utf8LineEnds; }	// Non-zero when UTF-8 line ends are being recognised
+	void SetLineEndTypes(int utf8LineEnds_);
 	void SetPerLine(PerLine *pl);
 	int Lines() const;
 	int LineStart(int line) const;
diff --git a/src/Document.cxx b/src/Document.cxx
index 2036f383c..b75c754ac 100644
--- a/src/Document.cxx
+++ b/src/Document.cxx
@@ -87,6 +87,7 @@ Document::Document() {
 	eolMode = SC_EOL_LF;
 #endif
 	dbcsCodePage = 0;
+	lineEndBitSet = SC_LINE_END_TYPE_DEFAULT;
 	stylingBits = 5;
 	stylingBitsMask = 0x1F;
 	stylingMask = 0;
@@ -157,12 +158,29 @@ bool Document::SetDBCSCodePage(int dbcsCodePage_) {
 	if (dbcsCodePage != dbcsCodePage_) {
 		dbcsCodePage = dbcsCodePage_;
 		SetCaseFolder(NULL);
+		cb.SetLineEndTypes(lineEndBitSet & LineEndTypesSupported());
 		return true;
 	} else {
 		return false;
 	}
 }
 
+bool Document::SetLineEndTypesAllowed(int lineEndBitSet_) {
+	// Returns true only when the line end types in use by the buffer changed
+	if (lineEndBitSet == lineEndBitSet_)
+		return false;
+	lineEndBitSet = lineEndBitSet_;
+	// Only allowed types that are also reported as supported can be in use
+	const int typesActive = lineEndBitSet & LineEndTypesSupported();
+	if (typesActive == cb.GetLineEndTypes())
+		return false;
+	// Notify from position 0 before switching as any line start may move
+	// NOTE(review): assumes ModifiedAt(0) marks the whole document as changed -- confirm
+	ModifiedAt(0);
+	cb.SetLineEndTypes(typesActive);
+	return true;
+}
+
 void Document::InsertLine(int line) {
 	for (int j=0; j<ldSize; j++) {
 		if (perLineData[j])
@@ -267,7 +285,21 @@ int SCI_METHOD Document::LineEnd(int line) const {
 	if (line == LinesTotal() - 1) {
 		return LineStart(line + 1);
 	} else {
-		int position = LineStart(line + 1) - 1;
+		int position = LineStart(line + 1);
+		if (SC_CP_UTF8 == dbcsCodePage) {
+			unsigned char bytes[] = {
+				static_cast<unsigned char>(cb.CharAt(position-3)),
+				static_cast<unsigned char>(cb.CharAt(position-2)),
+				static_cast<unsigned char>(cb.CharAt(position-1)),
+			};
+			if (UTF8IsSeparator(bytes)) {
+				return position - UTF8SeparatorLength;
+			}
+			if (UTF8IsNEL(bytes+1)) {
+				return position - UTF8NELLength;
+			}
+		}
+		position--; // Back over CR or LF
 		// When line terminator is CR+LF, may need to go back one more
 		if ((position > LineStart(line)) && (cb.CharAt(position - 1) == '\r')) {
 			position--;
diff --git a/src/Document.h b/src/Document.h
index 592d2ecb9..16804d3a1 100644
--- a/src/Document.h
+++ b/src/Document.h
@@ -241,6 +241,7 @@ public:
 	int eolMode;
 	/// Can also be SC_CP_UTF8 to enable UTF-8 mode
 	int dbcsCodePage;
+	int lineEndBitSet;
 	int tabInChars;
 	int indentInChars;
 	int actualIndentInChars;
@@ -259,6 +260,9 @@ public:
 	virtual void Init();
 	int LineEndTypesSupported() const;
 	bool SetDBCSCodePage(int dbcsCodePage_);
+	int GetLineEndTypesAllowed() { return lineEndBitSet; }	// Set requested by SetLineEndTypesAllowed; the active set may be narrower
+	bool SetLineEndTypesAllowed(int lineEndBitSet_);
+	int GetLineEndTypesActive() { return cb.GetLineEndTypes(); }	// Line end types the cell buffer is currently using
 	virtual void InsertLine(int line);
 	virtual void RemoveLine(int line);
 
diff --git a/src/Editor.cxx b/src/Editor.cxx
index e4ae6060e..f150aa202 100644
--- a/src/Editor.cxx
+++ b/src/Editor.cxx
@@ -7784,6 +7784,21 @@ sptr_t Editor::WndProc(unsigned int iMessage, uptr_t wParam, sptr_t lParam) {
 		pdoc->eolMode = wParam;
 		break;
 
+	case SCI_SETLINEENDTYPESALLOWED:
+		if (pdoc->SetLineEndTypesAllowed(wParam)) {
+			cs.Clear();
+			cs.InsertLines(0, pdoc->LinesTotal() - 1);
+			SetAnnotationHeights(0, pdoc->LinesTotal());
+			InvalidateStyleRedraw();
+		}
+		break;
+
+	case SCI_GETLINEENDTYPESALLOWED:
+		return pdoc->GetLineEndTypesAllowed();
+		
+	case SCI_GETLINEENDTYPESACTIVE:
+		return pdoc->GetLineEndTypesActive();
+		
 	case SCI_STARTSTYLING:
 		pdoc->StartStyling(wParam, static_cast<char>(lParam));
 		break;
@@ -8080,6 +8095,9 @@ sptr_t Editor::WndProc(unsigned int iMessage, uptr_t wParam, sptr_t lParam) {
 	case SCI_SETCODEPAGE:
 		if (ValidCodePage(wParam)) {
 			if (pdoc->SetDBCSCodePage(wParam)) {
+				cs.Clear();
+				cs.InsertLines(0, pdoc->LinesTotal() - 1);
+				SetAnnotationHeights(0, pdoc->LinesTotal());
 				InvalidateStyleRedraw();
 			}
 		}
diff --git a/src/UniConversion.h b/src/UniConversion.h
index 704f16239..70e8a9517 100644
--- a/src/UniConversion.h
+++ b/src/UniConversion.h
@@ -26,3 +26,16 @@ inline bool UTF8IsAscii(int ch) {
 
 enum { UTF8MaskWidth=0x7, UTF8MaskInvalid=0x8 };
 int UTF8Classify(const unsigned char *us, int len);
+
+// Line separator is U+2028 \xe2\x80\xa8
+// Paragraph separator is U+2029 \xe2\x80\xa9
+const int UTF8SeparatorLength = 3;
+inline bool UTF8IsSeparator(const unsigned char *us) {
+	return (0xe2 == us[0]) && (0x80 == us[1]) && ((us[2] & 0xfe) == 0xa8);	// Final byte is 0xa8 (LS) or 0xa9 (PS)
+}
+
+// NEL is U+0085 \xc2\x85
+const int UTF8NELLength = 2;
+inline bool UTF8IsNEL(const unsigned char *us) {
+	return (0xc2 == us[0]) && (0x85 == us[1]);	// Both bytes of the 2 byte sequence must match
+}
-- 
cgit v1.2.3