Support the three Unicode line ends NEL (U+0085), LS (U+2028), and PS (U+2029)
in CellBuffer, Document, Editor and the message interface.

They are only turned on for lexers that support Unicode line ends.
author: nyamatongwe <devnull@localhost> 2013-01-19 11:40:47 +1100
committer: nyamatongwe <devnull@localhost> 2013-01-19 11:40:47 +1100
commit: d6ac5bf56d40512ac0634d7a5bee6f7328b7d41f (patch)
tree: c8a0a61379695115cde7c7423ce4308f4c195336 /src
parent: 46ff1fe3d148b9d131788be6b4d7da8daa65189c (diff)
download: scintilla-mirror-d6ac5bf56d40512ac0634d7a5bee6f7328b7d41f.tar.gz
6 files changed, 181 insertions, 8 deletions
diff --git a/src/CellBuffer.cxx b/src/CellBuffer.cxx
index 11b8b4acd..7bb96ca76 100644
--- a/src/CellBuffer.cxx
+++ b/src/CellBuffer.cxx
@@ -16,6 +16,7 @@
 #include "SplitVector.h"
 #include "Partitioning.h"
 #include "CellBuffer.h"
+#include "UniConversion.h"
 
 #ifdef SCI_NAMESPACE
 using namespace Scintilla;
@@ -331,6 +332,7 @@ void UndoHistory::CompletedRedoStep() {
 
 CellBuffer::CellBuffer() {
 	readOnly = false;
+	utf8LineEnds = 0;	// Unicode line ends stay off until SetLineEndTypes enables them
 	collectingUndo = true;
 }
 
@@ -458,6 +460,13 @@ void CellBuffer::Allocate(int newSize) {
 	style.ReAllocate(newSize);
 }
 
+void CellBuffer::SetLineEndTypes(int utf8LineEnds_) {	// Choose whether UTF-8 line ends are recognised; line data is rebuilt when the value changes
+	if (utf8LineEnds != utf8LineEnds_) {	// Nothing to do when the setting is unchanged
+		utf8LineEnds = utf8LineEnds_;
+		ResetLineEnds();	// Existing line starts were computed under the old setting
+	}
+}
+
 void CellBuffer::SetPerLine(PerLine *pl) {
 	lv.SetPerLine(pl);
 }
@@ -501,11 +510,64 @@ void CellBuffer::RemoveLine(int line) {
 	lv.RemoveLine(line);
 }
 
+bool CellBuffer::UTF8LineEndOverlaps(int position) const {	// True when a UTF-8 line end straddles position: some of its bytes lie before position and the rest at or after it
+	unsigned char bytes[] = {	// Four-byte window covering position-2 .. position+1
+		static_cast<unsigned char>(substance.ValueAt(position-2)),
+		static_cast<unsigned char>(substance.ValueAt(position-1)),
+		static_cast<unsigned char>(substance.ValueAt(position)),
+		static_cast<unsigned char>(substance.ValueAt(position+1)),
+	};
+	return UTF8IsSeparator(bytes) || UTF8IsSeparator(bytes+1) || UTF8IsNEL(bytes+1);	// LS/PS starting at position-2 or position-1, or NEL starting at position-1
+}
+
+void CellBuffer::ResetLineEnds() {	// Recompute every line start by rescanning the whole buffer
+	// Reinitialize line data -- too much work to preserve
+	lv.Init();
+
+	int position = 0;	// The scan always starts at the beginning of the buffer
+	int length = Length();
+	int lineInsert = 1;	// Index at which the next discovered line is inserted
+	bool atLineStart = true;
+	lv.InsertText(lineInsert-1, length);	// NOTE(review): presumably grows line 0 to span the whole text before it is split -- confirm against LineVector
+	unsigned char chBeforePrev = 0;
+	unsigned char chPrev = 0;
+	unsigned char ch = ' ';
+	for (int i = 0; i < length; i++) {
+		ch = substance.ValueAt(position + i);
+		if (ch == '\r') {
+			InsertLine(lineInsert, (position + i) + 1, atLineStart);	// New line begins just after the CR
+			lineInsert++;
+		} else if (ch == '\n') {
+			if (chPrev == '\r') {
+				// Patch up what was end of line
+				lv.SetLineStart(lineInsert - 1, (position + i) + 1);	// The CR already produced a line; move its start past the LF so CRLF counts once
+			} else {
+				InsertLine(lineInsert, (position + i) + 1, atLineStart);
+				lineInsert++;
+			}
+		} else if (utf8LineEnds) {
+			unsigned char back3[3] = {chBeforePrev, chPrev, ch};	// The last three bytes seen, ending with the current one
+			if (UTF8IsSeparator(back3) || UTF8IsNEL(back3+1)) {	// Matches when ch is the final byte of LS/PS (3 bytes) or NEL (2 bytes)
+				InsertLine(lineInsert, (position + i) + 1, atLineStart);
+				lineInsert++;
+			}
+		}
+		chBeforePrev = chPrev;	// Slide the two-byte history forward
+		chPrev = ch;
+	}
+}
+
 void CellBuffer::BasicInsertString(int position, const char *s, int insertLength) {
 	if (insertLength == 0)
 		return;
 	PLATFORM_ASSERT(insertLength > 0);
 
+	unsigned char chAfter = substance.ValueAt(position);
+	bool breakingUTF8LineEnd = false;
+	if (utf8LineEnds && UTF8IsTrailByte(chAfter)) {
+		breakingUTF8LineEnd = UTF8LineEndOverlaps(position);
+	}
+
 	substance.InsertFromArray(position, s, 0, insertLength);
 	style.InsertValue(position, insertLength, 0);
 
@@ -513,14 +575,17 @@ void CellBuffer::BasicInsertString(int position, const char *s, int insertLength
 	bool atLineStart = lv.LineStart(lineInsert-1) == position;
 	// Point all the lines after the insertion point further along in the buffer
 	lv.InsertText(lineInsert-1, insertLength);
-	char chPrev = substance.ValueAt(position - 1);
-	char chAfter = substance.ValueAt(position + insertLength);
+	unsigned char chBeforePrev = substance.ValueAt(position - 2);
+	unsigned char chPrev = substance.ValueAt(position - 1);
 	if (chPrev == '\r' && chAfter == '\n') {
 		// Splitting up a crlf pair at position
 		InsertLine(lineInsert, position, false);
 		lineInsert++;
 	}
-	char ch = ' ';
+	if (breakingUTF8LineEnd) {
+		RemoveLine(lineInsert);
+	}
+	unsigned char ch = ' ';
 	for (int i = 0; i < insertLength; i++) {
 		ch = s[i];
 		if (ch == '\r') {
@@ -534,7 +599,14 @@ void CellBuffer::BasicInsertString(int position, const char *s, int insertLength
 				InsertLine(lineInsert, (position + i) + 1, atLineStart);
 				lineInsert++;
 			}
+		} else if (utf8LineEnds) {
+			unsigned char back3[3] = {chBeforePrev, chPrev, ch};
+			if (UTF8IsSeparator(back3) || UTF8IsNEL(back3+1)) {
+				InsertLine(lineInsert, (position + i) + 1, atLineStart);
+				lineInsert++;
+			}
 		}
+		chBeforePrev = chPrev;
 		chPrev = ch;
 	}
 	// Joining two lines where last insertion is cr and following substance starts with lf
@@ -543,6 +615,22 @@ void CellBuffer::BasicInsertString(int position, const char *s, int insertLength
 			// End of line already in buffer so drop the newly created one
 			RemoveLine(lineInsert - 1);
 		}
+	} else if (utf8LineEnds && !UTF8IsAscii(chAfter)) {
+		// May have end of UTF-8 line end in buffer and start in insertion
+		for (int j = 0; j < UTF8SeparatorLength-1; j++) {
+			unsigned char chAt = substance.ValueAt(position + insertLength + j);
+			unsigned char back3[3] = {chBeforePrev, chPrev, chAt};
+			if (UTF8IsSeparator(back3)) {
+				InsertLine(lineInsert, (position + insertLength + j) + 1, atLineStart);
+				lineInsert++;
+			}
+			if ((j == 0) && UTF8IsNEL(back3+1)) {
+				InsertLine(lineInsert, (position + insertLength + j) + 1, atLineStart);
+				lineInsert++;
+			}
+			chBeforePrev = chPrev;
+			chPrev = chAt;
+		}
 	}
 }
 
@@ -560,9 +648,9 @@ void CellBuffer::BasicDeleteChars(int position, int deleteLength) {
 
 		int lineRemove = lv.LineFromPosition(position) + 1;
 		lv.InsertText(lineRemove-1, - (deleteLength));
-		char chPrev = substance.ValueAt(position - 1);
-		char chBefore = chPrev;
-		char chNext = substance.ValueAt(position);
+		unsigned char chPrev = substance.ValueAt(position - 1);
+		unsigned char chBefore = chPrev;
+		unsigned char chNext = substance.ValueAt(position);
 		bool ignoreNL = false;
 		if (chPrev == '\r' && chNext == '\n') {
 			// Move back one
@@ -570,8 +658,13 @@ void CellBuffer::BasicDeleteChars(int position, int deleteLength) {
 			lineRemove++;
 			ignoreNL = true; 	// First \n is not real deletion
 		}
+		if (utf8LineEnds && UTF8IsTrailByte(chNext)) {
+			if (UTF8LineEndOverlaps(position)) {
+				RemoveLine(lineRemove);
+			}
+		}
 
-		char ch = chNext;
+		unsigned char ch = chNext;
 		for (int i = 0; i < deleteLength; i++) {
 			chNext = substance.ValueAt(position + i + 1);
 			if (ch == '\r') {
@@ -584,6 +677,14 @@ void CellBuffer::BasicDeleteChars(int position, int deleteLength) {
 				} else {
 					RemoveLine(lineRemove);
 				}
+			} else if (utf8LineEnds) {
+				if (!UTF8IsAscii(ch)) {
+					unsigned char next3[3] = {ch, chNext,
+						static_cast<unsigned char>(substance.ValueAt(position + i + 2))};
+					if (UTF8IsSeparator(next3) || UTF8IsNEL(next3)) {
+						RemoveLine(lineRemove);
+					}
+				}
 			}
 
 			ch = chNext;
diff --git a/src/CellBuffer.h b/src/CellBuffer.h
index 388b9027b..bfbb121de 100644
--- a/src/CellBuffer.h
+++ b/src/CellBuffer.h
@@ -136,12 +136,15 @@ private:
 	SplitVector<char> substance;
 	SplitVector<char> style;
 	bool readOnly;
+	int utf8LineEnds;
 
 	bool collectingUndo;
 	UndoHistory uh;
 
 	LineVector lv;
 
+	bool UTF8LineEndOverlaps(int position) const;
+	void ResetLineEnds();
 	/// Actions without undo
 	void BasicInsertString(int position, const char *s, int insertLength);
 	void BasicDeleteChars(int position, int deleteLength);
@@ -162,6 +165,8 @@ public:
 
 	int Length() const;
 	void Allocate(int newSize);
+	int GetLineEndTypes() const { return utf8LineEnds; }
+	void SetLineEndTypes(int utf8LineEnds_);
 	void SetPerLine(PerLine *pl);
 	int Lines() const;
 	int LineStart(int line) const;
diff --git a/src/Document.cxx b/src/Document.cxx
index 2036f383c..b75c754ac 100644
--- a/src/Document.cxx
+++ b/src/Document.cxx
@@ -87,6 +87,7 @@ Document::Document() {
 	eolMode = SC_EOL_LF;
 #endif
 	dbcsCodePage = 0;
+	lineEndBitSet = SC_LINE_END_TYPE_DEFAULT;
 	stylingBits = 5;
 	stylingBitsMask = 0x1F;
 	stylingMask = 0;
@@ -157,12 +158,29 @@ bool Document::SetDBCSCodePage(int dbcsCodePage_) {
 	if (dbcsCodePage != dbcsCodePage_) {
 		dbcsCodePage = dbcsCodePage_;
 		SetCaseFolder(NULL);
+		cb.SetLineEndTypes(lineEndBitSet & LineEndTypesSupported());
 		return true;
 	} else {
 		return false;
 	}
 }
 
+bool Document::SetLineEndTypesAllowed(int lineEndBitSet_) {	// Store the allowed set; returns true only when the active set held by the cell buffer changed
+	if (lineEndBitSet != lineEndBitSet_) {
+		lineEndBitSet = lineEndBitSet_;
+		int lineEndBitSetActive = lineEndBitSet & LineEndTypesSupported();	// Active set is the allowed set restricted to what LineEndTypesSupported() reports
+		if (lineEndBitSetActive != cb.GetLineEndTypes()) {
+			ModifiedAt(0);	// NOTE(review): presumably flags everything from position 0 as changed -- confirm
+			cb.SetLineEndTypes(lineEndBitSetActive);	// The cell buffer rebuilds its line data for the new setting
+			return true;
+		} else {
+			return false;	// Allowed set changed but the active set did not
+		}
+	} else {
+		return false;	// Same allowed set as before
+	}
+}
+
 void Document::InsertLine(int line) {
 	for (int j=0; j<ldSize; j++) {
 		if (perLineData[j])
@@ -267,7 +285,21 @@ int SCI_METHOD Document::LineEnd(int line) const {
 	if (line == LinesTotal() - 1) {
 		return LineStart(line + 1);
 	} else {
-		int position = LineStart(line + 1) - 1;
+		int position = LineStart(line + 1);
+		if (SC_CP_UTF8 == dbcsCodePage) {
+			unsigned char bytes[] = {
+				static_cast<unsigned char>(cb.CharAt(position-3)),
+				static_cast<unsigned char>(cb.CharAt(position-2)),
+				static_cast<unsigned char>(cb.CharAt(position-1)),
+			};
+			if (UTF8IsSeparator(bytes)) {
+				return position - UTF8SeparatorLength;
+			}
+			if (UTF8IsNEL(bytes+1)) {
+				return position - UTF8NELLength;
+			}
+		}
+		position--; // Back over CR or LF
 		// When line terminator is CR+LF, may need to go back one more
 		if ((position > LineStart(line)) && (cb.CharAt(position - 1) == '\r')) {
 			position--;
diff --git a/src/Document.h b/src/Document.h
index 592d2ecb9..16804d3a1 100644
--- a/src/Document.h
+++ b/src/Document.h
@@ -241,6 +241,7 @@ public:
 	int eolMode;
 	/// Can also be SC_CP_UTF8 to enable UTF-8 mode
 	int dbcsCodePage;
+	int lineEndBitSet;
 	int tabInChars;
 	int indentInChars;
 	int actualIndentInChars;
@@ -259,6 +260,9 @@ public:
 	virtual void Init();
 	int LineEndTypesSupported() const;
 	bool SetDBCSCodePage(int dbcsCodePage_);
+	int GetLineEndTypesAllowed() const { return lineEndBitSet; }	// Set requested via SetLineEndTypesAllowed, before masking with LineEndTypesSupported()
+	bool SetLineEndTypesAllowed(int lineEndBitSet_);
+	int GetLineEndTypesActive() { return cb.GetLineEndTypes(); }	// Line end types the cell buffer is currently using
 	virtual void InsertLine(int line);
 	virtual void RemoveLine(int line);
 
diff --git a/src/Editor.cxx b/src/Editor.cxx
index e4ae6060e..f150aa202 100644
--- a/src/Editor.cxx
+++ b/src/Editor.cxx
@@ -7784,6 +7784,21 @@ sptr_t Editor::WndProc(unsigned int iMessage, uptr_t wParam, sptr_t lParam) {
 		pdoc->eolMode = wParam;
 		break;
 
+	case SCI_SETLINEENDTYPESALLOWED:
+		if (pdoc->SetLineEndTypesAllowed(wParam)) {
+			cs.Clear();
+			cs.InsertLines(0, pdoc->LinesTotal() - 1);
+			SetAnnotationHeights(0, pdoc->LinesTotal());
+			InvalidateStyleRedraw();
+		}
+		break;
+
+	case SCI_GETLINEENDTYPESALLOWED:
+		return pdoc->GetLineEndTypesAllowed();
+		
+	case SCI_GETLINEENDTYPESACTIVE:
+		return pdoc->GetLineEndTypesActive();
+		
 	case SCI_STARTSTYLING:
 		pdoc->StartStyling(wParam, static_cast<char>(lParam));
 		break;
@@ -8080,6 +8095,9 @@ sptr_t Editor::WndProc(unsigned int iMessage, uptr_t wParam, sptr_t lParam) {
 	case SCI_SETCODEPAGE:
 		if (ValidCodePage(wParam)) {
 			if (pdoc->SetDBCSCodePage(wParam)) {
+				cs.Clear();
+				cs.InsertLines(0, pdoc->LinesTotal() - 1);
+				SetAnnotationHeights(0, pdoc->LinesTotal());
 				InvalidateStyleRedraw();
 			}
 		}
diff --git a/src/UniConversion.h b/src/UniConversion.h
index 704f16239..70e8a9517 100644
--- a/src/UniConversion.h
+++ b/src/UniConversion.h
@@ -26,3 +26,16 @@ inline bool UTF8IsAscii(int ch) {
 
 enum { UTF8MaskWidth=0x7, UTF8MaskInvalid=0x8 };
 int UTF8Classify(const unsigned char *us, int len);
+
+// Line separator is U+2028 \xe2\x80\xa8
+// Paragraph separator is U+2029 \xe2\x80\xa9
+const int UTF8SeparatorLength = 3;
+inline bool UTF8IsSeparator(const unsigned char *us) {	// True if us begins with the UTF-8 bytes of U+2028 or U+2029; may read up to 3 bytes
+	return (us[0] == 0xe2) && (us[1] == 0x80) && ((us[2] == 0xa8) || (us[2] == 0xa9));
+}
+
+// NEL is U+0085 \xc2\x85
+const int UTF8NELLength = 2;
+inline bool UTF8IsNEL(const unsigned char *us) {	// True if us begins with the UTF-8 bytes of U+0085; may read up to 2 bytes
+	return (us[0] == 0xc2) && (us[1] == 0x85);
+}
author	nyamatongwe <devnull@localhost>	2013-01-19 11:40:47 +1100
committer	nyamatongwe <devnull@localhost>	2013-01-19 11:40:47 +1100
commit	d6ac5bf56d40512ac0634d7a5bee6f7328b7d41f (patch)
tree	c8a0a61379695115cde7c7423ce4308f4c195336 /src
parent	46ff1fe3d148b9d131788be6b4d7da8daa65189c (diff)
download	scintilla-mirror-d6ac5bf56d40512ac0634d7a5bee6f7328b7d41f.tar.gz