Optional indexing of line starts in UTF-8 documents by UTF-32 code points and UTF-16 code units added.
author: Neil <nyamatongwe@gmail.com> 2018-07-10 15:06:50 +1000
committer: Neil <nyamatongwe@gmail.com> 2018-07-10 15:06:50 +1000
commit: 56e20ea0283d8018dee48d736ba9dfef3c84dc3f (patch)
tree: 21bdb500dfc092fadecb123b87e9799a2c46f6a9 /src
parent: d27cbe587930d13d3f1802b271d0d13e7e3c6e38 (diff)
download: scintilla-mirror-56e20ea0283d8018dee48d736ba9dfef3c84dc3f.tar.gz
7 files changed, 410 insertions, 6 deletions
diff --git a/src/CellBuffer.cxx b/src/CellBuffer.cxx
index e8c385f1f..ffe5fe8b3 100644
--- a/src/CellBuffer.cxx
+++ b/src/CellBuffer.cxx
@@ -7,6 +7,7 @@
 
 #include <cstddef>
 #include <cstdlib>
+#include <cassert>
 #include <cstring>
 #include <cstdio>
 #include <cstdarg>
@@ -28,17 +29,53 @@
 
 namespace Scintilla {
 
+struct CountWidths {
+	// Measures the number of characters in a string divided into those
+	// from the Base Multilingual Plane and those from other planes.
+	Sci::Position countBasePlane;
+	Sci::Position countOtherPlanes;
+	CountWidths(Sci::Position countBasePlane_=0, Sci::Position countOtherPlanes_=0) noexcept :
+		countBasePlane(countBasePlane_),
+		countOtherPlanes(countOtherPlanes_) {
+	}
+	CountWidths operator-() const noexcept {
+		return CountWidths(-countBasePlane , -countOtherPlanes);
+	}
+	Sci::Position WidthUTF32() const noexcept {
+		// All code points take one code unit in UTF-32.
+		return countBasePlane + countOtherPlanes;
+	}
+	Sci::Position WidthUTF16() const noexcept {
+		// UTF-16 takes 2 code units for other planes
+		return countBasePlane + 2 * countOtherPlanes;
+	}
+	void CountChar(int lenChar) noexcept {
+		if (lenChar == 4) {
+			countOtherPlanes++;
+		} else {
+			countBasePlane++;
+		}
+	}
+};
+
 class ILineVector {
 public:
 	virtual void Init() = 0;
 	virtual void SetPerLine(PerLine *pl) = 0;
 	virtual void InsertText(Sci::Line line, Sci::Position delta) = 0;
 	virtual void InsertLine(Sci::Line line, Sci::Position position, bool lineStart) = 0;
-	virtual void SetLineStart(Sci::Line line, Sci::Position position) = 0;
+	virtual void SetLineStart(Sci::Line line, Sci::Position position) noexcept = 0;
 	virtual void RemoveLine(Sci::Line line) = 0;
 	virtual Sci::Line Lines() const noexcept = 0;
 	virtual Sci::Line LineFromPosition(Sci::Position pos) const noexcept = 0;
 	virtual Sci::Position LineStart(Sci::Line line) const noexcept = 0;
+	virtual void InsertCharacters(Sci::Line line, CountWidths delta) = 0;
+	virtual void SetLineCharactersWidth(Sci::Line line, CountWidths width) = 0;
+	virtual int LineCharacterIndex() const noexcept = 0;
+	virtual bool AllocateLineCharacterIndex(int lineCharacterIndex, Sci::Line lines) = 0;
+	virtual bool ReleaseLineCharacterIndex(int lineCharacterIndex) = 0;
+	virtual Sci::Position IndexLineStart(Sci::Line line, int lineCharacterIndex) const noexcept = 0;
+	virtual Sci::Line LineFromPositionIndex(Sci::Position pos, int lineCharacterIndex) const noexcept = 0;
 	virtual ~ILineVector() {}
 };
 
@@ -47,9 +84,58 @@ public:
 using namespace Scintilla;
 
 template <typename POS>
+class LineStartIndex {
+public:
+	int refCount;
+	Partitioning<POS> starts;
+
+	LineStartIndex() : refCount(0), starts(4) {
+		// Minimal initial allocation
+	}
+	// Deleted so LineStartIndex objects can not be copied.
+	LineStartIndex(const LineStartIndex &) = delete;
+	LineStartIndex(LineStartIndex &&) = delete;
+	void operator=(const LineStartIndex &) = delete;
+	void operator=(LineStartIndex &&) = delete;
+	virtual ~LineStartIndex() {
+		starts.DeleteAll();
+	}
+	bool Allocate(Sci::Line lines) {
+		refCount++;
+		Sci::Position length = starts.PositionFromPartition(starts.Partitions());
+		for (Sci::Line line = starts.Partitions(); line < lines; line++) {
+			// Produce an ascending sequence that will be filled in with correct widths later
+			length++;
+			starts.InsertPartition(static_cast<POS>(line), static_cast<POS>(length));
+		}
+		return refCount == 1;
+	}
+	bool Release() {
+		if (refCount == 1) {
+			starts.DeleteAll();
+		}
+		refCount--;
+		return refCount == 0;
+	}
+	bool Active() const noexcept {
+		return refCount > 0;
+	}
+	Sci::Position LineWidth(Sci::Line line) const noexcept {
+		return starts.PositionFromPartition(static_cast<POS>(line) + 1) -
+			starts.PositionFromPartition(static_cast<POS>(line));
+	}
+	void SetLineWidth(Sci::Line line, Sci::Position width) {
+		const Sci::Position widthCurrent = LineWidth(line);
+		starts.InsertText(static_cast<POS>(line), static_cast<POS>(width - widthCurrent));
+	}
+};
+
+template <typename POS>
 class LineVector : public ILineVector {
 	Partitioning<POS> starts;
 	PerLine *perLine;
+	LineStartIndex<POS> startsUTF16;
+	LineStartIndex<POS> startsUTF32;
 public:
 	LineVector() : starts(256), perLine(0) {
 		Init();
@@ -66,7 +152,9 @@ public:
 		if (perLine) {
 			perLine->Init();
 		}
- 	}
+		startsUTF32.starts.DeleteAll();
+		startsUTF16.starts.DeleteAll();
+	}
 	void SetPerLine(PerLine *pl) override {
 		perLine = pl;
 	}
@@ -74,18 +162,33 @@ public:
 		starts.InsertText(static_cast<POS>(line), static_cast<POS>(delta));
 	}
 	void InsertLine(Sci::Line line, Sci::Position position, bool lineStart) override {
-		starts.InsertPartition(static_cast<POS>(line), static_cast<POS>(position));
+		const POS lineAsPos = static_cast<POS>(line);
+		starts.InsertPartition(lineAsPos, static_cast<POS>(position));
+		if (startsUTF32.Active()) {
+			startsUTF32.starts.InsertPartition(lineAsPos,
+				static_cast<POS>(startsUTF32.starts.PositionFromPartition(lineAsPos - 1) + 1));
+		}
+		if (startsUTF16.Active()) {
+			startsUTF16.starts.InsertPartition(lineAsPos,
+				static_cast<POS>(startsUTF16.starts.PositionFromPartition(lineAsPos - 1) + 1));
+		}
 		if (perLine) {
 			if ((line > 0) && lineStart)
 				line--;
 			perLine->InsertLine(line);
 		}
 	}
-	void SetLineStart(Sci::Line line, Sci::Position position) override {
+	void SetLineStart(Sci::Line line, Sci::Position position) noexcept override {
 		starts.SetPartitionStartPosition(static_cast<POS>(line), static_cast<POS>(position));
 	}
 	void RemoveLine(Sci::Line line) override {
 		starts.RemovePartition(static_cast<POS>(line));
+		if (startsUTF32.Active()) {
+			startsUTF32.starts.RemovePartition(static_cast<POS>(line));
+		}
+		if (startsUTF16.Active()) {
+			startsUTF16.starts.RemovePartition(static_cast<POS>(line));
+		}
 		if (perLine) {
 			perLine->RemoveLine(line);
 		}
@@ -99,6 +202,71 @@ public:
 	Sci::Position LineStart(Sci::Line line) const noexcept override {
 		return starts.PositionFromPartition(static_cast<POS>(line));
 	}
+	void InsertCharacters(Sci::Line line, CountWidths delta) override {
+		if (startsUTF32.Active()) {
+			startsUTF32.starts.InsertText(static_cast<POS>(line), static_cast<POS>(delta.WidthUTF32()));
+		}
+		if (startsUTF16.Active()) {
+			startsUTF16.starts.InsertText(static_cast<POS>(line), static_cast<POS>(delta.WidthUTF16()));
+		}
+	}
+	void SetLineCharactersWidth(Sci::Line line, CountWidths width) override {
+		if (startsUTF32.Active()) {
+			assert(startsUTF32.starts.Partitions() == starts.Partitions());
+			startsUTF32.SetLineWidth(line, width.WidthUTF32());
+		}
+		if (startsUTF16.Active()) {
+			assert(startsUTF16.starts.Partitions() == starts.Partitions());
+			startsUTF16.SetLineWidth(line, width.WidthUTF16());
+		}
+	}
+
+	int LineCharacterIndex() const noexcept override {
+		int retVal = 0;
+		if (startsUTF32.Active()) {
+			retVal |= SC_LINECHARACTERINDEX_UTF32;
+		}
+		if (startsUTF16.Active()) {
+			retVal |= SC_LINECHARACTERINDEX_UTF16;
+		}
+		return retVal;
+	}
+	bool AllocateLineCharacterIndex(int lineCharacterIndex, Sci::Line lines) override {
+		bool changed = false;
+		if ((lineCharacterIndex & SC_LINECHARACTERINDEX_UTF32) != 0) {
+			changed = startsUTF32.Allocate(lines) || changed;
+			assert(startsUTF32.starts.Partitions() == starts.Partitions());
+		}
+		if ((lineCharacterIndex & SC_LINECHARACTERINDEX_UTF16) != 0) {
+			changed = startsUTF16.Allocate(lines) || changed;
+			assert(startsUTF16.starts.Partitions() == starts.Partitions());
+		}
+		return changed;
+	}
+	bool ReleaseLineCharacterIndex(int lineCharacterIndex) override {
+		bool changed = false;
+		if ((lineCharacterIndex & SC_LINECHARACTERINDEX_UTF32) != 0) {
+			changed = startsUTF32.Release() || changed;
+		}
+		if ((lineCharacterIndex & SC_LINECHARACTERINDEX_UTF16) != 0) {
+			changed = startsUTF16.Release() || changed;
+		}
+		return changed;
+	}
+	Sci::Position IndexLineStart(Sci::Line line, int lineCharacterIndex) const noexcept override {
+		if (lineCharacterIndex == SC_LINECHARACTERINDEX_UTF32) {
+			return startsUTF32.starts.PositionFromPartition(static_cast<POS>(line));
+		} else {
+			return startsUTF16.starts.PositionFromPartition(static_cast<POS>(line));
+		}
+	}
+	Sci::Line LineFromPositionIndex(Sci::Position pos, int lineCharacterIndex) const noexcept override {
+		if (lineCharacterIndex == SC_LINECHARACTERINDEX_UTF32) {
+			return static_cast<Sci::Line>(startsUTF32.starts.PartitionFromPosition(static_cast<POS>(pos)));
+		} else {
+			return static_cast<Sci::Line>(startsUTF16.starts.PartitionFromPosition(static_cast<POS>(pos)));
+		}
+	}
 };
 
 Action::Action() {
@@ -364,6 +532,7 @@ void UndoHistory::CompletedRedoStep() {
 CellBuffer::CellBuffer(bool hasStyles_, bool largeDocument_) :
 	hasStyles(hasStyles_), largeDocument(largeDocument_) {
 	readOnly = false;
+	utf8Substance = false;
 	utf8LineEnds = 0;
 	collectingUndo = true;
 	if (largeDocument)
@@ -505,10 +674,19 @@ void CellBuffer::Allocate(Sci::Position newSize) {
 	}
 }
 
+void CellBuffer::SetUTF8Substance(bool utf8Substance_) {
+	if (utf8Substance != utf8Substance_) {
+		utf8Substance = utf8Substance_;
+		ResetLineEnds();
+	}
+}
+
 void CellBuffer::SetLineEndTypes(int utf8LineEnds_) {
 	if (utf8LineEnds != utf8LineEnds_) {
+		const int indexes = plv->LineCharacterIndex();
 		utf8LineEnds = utf8LineEnds_;
 		ResetLineEnds();
+		AllocateLineCharacterIndex(indexes);
 	}
 }
 
@@ -535,6 +713,23 @@ void CellBuffer::SetPerLine(PerLine *pl) {
 	plv->SetPerLine(pl);
 }
 
+int CellBuffer::LineCharacterIndex() const noexcept {
+	return plv->LineCharacterIndex();
+}
+
+void CellBuffer::AllocateLineCharacterIndex(int lineCharacterIndex) {
+	if (utf8Substance) {
+		if (plv->AllocateLineCharacterIndex(lineCharacterIndex, Lines())) {
+			// Changed so recalculate whole file
+			RecalculateIndexLineStarts(0, Lines() - 1);
+		}
+	}
+}
+
+void CellBuffer::ReleaseLineCharacterIndex(int lineCharacterIndex) {
+	plv->ReleaseLineCharacterIndex(lineCharacterIndex);
+}
+
 Sci::Line CellBuffer::Lines() const noexcept {
 	return plv->Lines();
 }
@@ -552,6 +747,14 @@ Sci::Line CellBuffer::LineFromPosition(Sci::Position pos) const noexcept {
 	return plv->LineFromPosition(pos);
 }
 
+Sci::Position CellBuffer::IndexLineStart(Sci::Line line, int lineCharacterIndex) const noexcept {
+	return plv->IndexLineStart(line, lineCharacterIndex);
+}
+
+Sci::Line CellBuffer::LineFromPositionIndex(Sci::Position pos, int lineCharacterIndex) const noexcept {
+	return plv->LineFromPositionIndex(pos, lineCharacterIndex);
+}
+
 bool CellBuffer::IsReadOnly() const {
 	return readOnly;
 }
@@ -612,6 +815,37 @@ bool CellBuffer::UTF8LineEndOverlaps(Sci::Position position) const {
 	return UTF8IsSeparator(bytes) || UTF8IsSeparator(bytes+1) || UTF8IsNEL(bytes+1);
 }
 
+bool CellBuffer::UTF8IsCharacterBoundary(Sci::Position position) const {
+	assert(position >= 0 && position <= Length());
+	if (position > 0) {
+		std::string back;
+		for (int i = 0; i < UTF8MaxBytes; i++) {
+			const Sci::Position posBack = position - i;
+			if (posBack < 0) {
+				return false;
+			}
+			back.insert(0, 1, substance.ValueAt(posBack));
+			if (!UTF8IsTrailByte(back.front())) {
+				if (i > 0) {
+					// Have reached a non-trail
+					const int cla = UTF8Classify(back);
+					if ((cla & UTF8MaskInvalid) || (cla != i)) {
+						return false;
+					}
+				}
+				break;
+			}
+		}
+	}
+	if (position < Length()) {
+		const unsigned char fore = substance.ValueAt(position);
+		if (UTF8IsTrailByte(fore)) {
+			return false;
+		}
+	}
+	return true;
+}
+
 void CellBuffer::ResetLineEnds() {
 	// Reinitialize line data -- too much work to preserve
 	plv->Init();
@@ -648,6 +882,38 @@ void CellBuffer::ResetLineEnds() {
 	}
 }
 
+namespace {
+
+CountWidths CountCharacterWidthsUTF8(std::string_view sv) noexcept {
+	CountWidths cw;
+	size_t remaining = sv.length();
+	while (remaining > 0) {
+		const int utf8Status = UTF8Classify(sv);
+		const int lenChar = utf8Status & UTF8MaskWidth;
+		cw.CountChar(lenChar);
+		sv.remove_prefix(lenChar);
+		remaining -= lenChar;
+	}
+	return cw;
+}
+
+}
+
+void CellBuffer::RecalculateIndexLineStarts(Sci::Line lineFirst, Sci::Line lineLast) {
+	std::string text;
+	Sci::Position posLineEnd = LineStart(lineFirst);
+	for (Sci::Line line = lineFirst; line <= lineLast; line++) {
+		// Find line start and end, retrieve text of line, count characters and update line width
+		const Sci::Position posLineStart = posLineEnd;
+		posLineEnd = LineStart(line+1);
+		const Sci::Position width = posLineEnd - posLineStart;
+		text.resize(width);
+		GetCharRange(text.data(), posLineStart, width);
+		const CountWidths cw = CountCharacterWidthsUTF8(text);
+		plv->SetLineCharactersWidth(line, cw);
+	}
+}
+
 void CellBuffer::BasicInsertString(Sci::Position position, const char *s, Sci::Position insertLength) {
 	if (insertLength == 0)
 		return;
@@ -659,12 +925,25 @@ void CellBuffer::BasicInsertString(Sci::Position position, const char *s, Sci::P
 		breakingUTF8LineEnd = UTF8LineEndOverlaps(position);
 	}
 
+	const Sci::Line linePosition = plv->LineFromPosition(position);
+	Sci::Line lineInsert = linePosition + 1;
+
+	// A simple insertion is one that inserts valid text on a single line at a character boundary
+	bool simpleInsertion = false;
+
+	// Check for breaking apart a UTF-8 sequence and inserting invalid UTF-8
+	if (utf8Substance && (plv->LineCharacterIndex() != SC_LINECHARACTERINDEX_NONE)) {
+		// Actually, don't need to check that whole insertion is valid just that there
+		// are no potential fragments at ends.
+		simpleInsertion = UTF8IsCharacterBoundary(position) &&
+			UTF8IsValid(std::string_view(s, insertLength));
+	}
+
 	substance.InsertFromArray(position, s, 0, insertLength);
 	if (hasStyles) {
 		style.InsertValue(position, insertLength, 0);
 	}
 
-	Sci::Line lineInsert = plv->LineFromPosition(position) + 1;
 	const bool atLineStart = plv->LineStart(lineInsert-1) == position;
 	// Point all the lines after the insertion point further along in the buffer
 	plv->InsertText(lineInsert-1, insertLength);
@@ -684,6 +963,7 @@ void CellBuffer::BasicInsertString(Sci::Position position, const char *s, Sci::P
 		if (ch == '\r') {
 			InsertLine(lineInsert, (position + i) + 1, atLineStart);
 			lineInsert++;
+			simpleInsertion = false;
 		} else if (ch == '\n') {
 			if (chPrev == '\r') {
 				// Patch up what was end of line
@@ -692,11 +972,13 @@ void CellBuffer::BasicInsertString(Sci::Position position, const char *s, Sci::P
 				InsertLine(lineInsert, (position + i) + 1, atLineStart);
 				lineInsert++;
 			}
+			simpleInsertion = false;
 		} else if (utf8LineEnds) {
 			const unsigned char back3[3] = {chBeforePrev, chPrev, ch};
 			if (UTF8IsSeparator(back3) || UTF8IsNEL(back3+1)) {
 				InsertLine(lineInsert, (position + i) + 1, atLineStart);
 				lineInsert++;
+				simpleInsertion = false;
 			}
 		}
 		chBeforePrev = chPrev;
@@ -707,6 +989,7 @@ void CellBuffer::BasicInsertString(Sci::Position position, const char *s, Sci::P
 		if (ch == '\r') {
 			// End of line already in buffer so drop the newly created one
 			RemoveLine(lineInsert - 1);
+			simpleInsertion = false;
 		}
 	} else if (utf8LineEnds && !UTF8IsAscii(chAfter)) {
 		// May have end of UTF-8 line end in buffer and start in insertion
@@ -716,21 +999,31 @@ void CellBuffer::BasicInsertString(Sci::Position position, const char *s, Sci::P
 			if (UTF8IsSeparator(back3)) {
 				InsertLine(lineInsert, (position + insertLength + j) + 1, atLineStart);
 				lineInsert++;
+				simpleInsertion = false;
 			}
 			if ((j == 0) && UTF8IsNEL(back3+1)) {
 				InsertLine(lineInsert, (position + insertLength + j) + 1, atLineStart);
 				lineInsert++;
+				simpleInsertion = false;
 			}
 			chBeforePrev = chPrev;
 			chPrev = chAt;
 		}
 	}
+	if (simpleInsertion) {
+		const CountWidths cw = CountCharacterWidthsUTF8(std::string_view(s, insertLength));
+		plv->InsertCharacters(linePosition, cw);
+	} else {
+		RecalculateIndexLineStarts(linePosition, lineInsert - 1);
+	}
 }
 
 void CellBuffer::BasicDeleteChars(Sci::Position position, Sci::Position deleteLength) {
 	if (deleteLength == 0)
 		return;
 
+	Sci::Line lineRecalculateStart = INVALID_POSITION;
+
 	if ((position == 0) && (deleteLength == substance.Length())) {
 		// If whole buffer is being deleted, faster to reinitialise lines data
 		// than to delete each line.
@@ -739,11 +1032,37 @@ void CellBuffer::BasicDeleteChars(Sci::Position position, Sci::Position deleteLe
 		// Have to fix up line positions before doing deletion as looking at text in buffer
 		// to work out which lines have been removed
 
-		Sci::Line lineRemove = plv->LineFromPosition(position) + 1;
+		const Sci::Line linePosition = plv->LineFromPosition(position);
+		Sci::Line lineRemove = linePosition + 1;
+
 		plv->InsertText(lineRemove-1, - (deleteLength));
 		const unsigned char chPrev = substance.ValueAt(position - 1);
 		const unsigned char chBefore = chPrev;
 		unsigned char chNext = substance.ValueAt(position);
+
+		// Check for breaking apart a UTF-8 sequence
+		// Needs further checks that text is UTF-8 or that some other break apart is occurring
+		if (utf8Substance && (plv->LineCharacterIndex() != SC_LINECHARACTERINDEX_NONE)) {
+			const Sci::Position posEnd = position + deleteLength;
+			const Sci::Line lineEndRemove = plv->LineFromPosition(posEnd);
+			const bool simpleDeletion =
+				(linePosition == lineEndRemove) &&
+				UTF8IsCharacterBoundary(position) && UTF8IsCharacterBoundary(posEnd);
+			if (simpleDeletion) {
+				std::string text(deleteLength, '\0');
+				GetCharRange(text.data(), position, deleteLength);
+				if (UTF8IsValid(text)) {
+					// Everything is good
+					const CountWidths cw = CountCharacterWidthsUTF8(text);
+					plv->InsertCharacters(linePosition, -cw);
+				} else {
+					lineRecalculateStart = linePosition;
+				}
+			} else {
+				lineRecalculateStart = linePosition;
+			}
+		}
+
 		bool ignoreNL = false;
 		if (chPrev == '\r' && chNext == '\n') {
 			// Move back one
@@ -792,6 +1111,9 @@ void CellBuffer::BasicDeleteChars(Sci::Position position, Sci::Position deleteLe
 		}
 	}
 	substance.DeleteRange(position, deleteLength);
+	if (lineRecalculateStart >= 0) {
+		RecalculateIndexLineStarts(lineRecalculateStart, lineRecalculateStart);
+	}
 	if (hasStyles) {
 		style.DeleteRange(position, deleteLength);
 	}
diff --git a/src/CellBuffer.h b/src/CellBuffer.h
index f360b2a23..b9f2406f1 100644
--- a/src/CellBuffer.h
+++ b/src/CellBuffer.h
@@ -113,6 +113,7 @@ private:
 	SplitVector<char> substance;
 	SplitVector<char> style;
 	bool readOnly;
+	bool utf8Substance;
 	int utf8LineEnds;
 
 	bool collectingUndo;
@@ -121,7 +122,9 @@ private:
 	std::unique_ptr<ILineVector> plv;
 
 	bool UTF8LineEndOverlaps(Sci::Position position) const;
+	bool UTF8IsCharacterBoundary(Sci::Position position) const;
 	void ResetLineEnds();
+	void RecalculateIndexLineStarts(Sci::Line lineFirst, Sci::Line lineLast);
 	/// Actions without undo
 	void BasicInsertString(Sci::Position position, const char *s, Sci::Position insertLength);
 	void BasicDeleteChars(Sci::Position position, Sci::Position deleteLength);
@@ -148,13 +151,19 @@ public:
 
 	Sci::Position Length() const noexcept;
 	void Allocate(Sci::Position newSize);
+	void SetUTF8Substance(bool utf8Substance_);
 	int GetLineEndTypes() const { return utf8LineEnds; }
 	void SetLineEndTypes(int utf8LineEnds_);
 	bool ContainsLineEnd(const char *s, Sci::Position length) const;
 	void SetPerLine(PerLine *pl);
+	int LineCharacterIndex() const noexcept;
+	void AllocateLineCharacterIndex(int lineCharacterIndex);
+	void ReleaseLineCharacterIndex(int lineCharacterIndex);
 	Sci::Line Lines() const noexcept;
 	Sci::Position LineStart(Sci::Line line) const noexcept;
+	Sci::Position IndexLineStart(Sci::Line line, int lineCharacterIndex) const noexcept;
 	Sci::Line LineFromPosition(Sci::Position pos) const noexcept;
+	Sci::Line LineFromPositionIndex(Sci::Position pos, int lineCharacterIndex) const noexcept;
 	void InsertLine(Sci::Line line, Sci::Position position, bool lineStart);
 	void RemoveLine(Sci::Line line);
 	const char *InsertString(Sci::Position position, const char *s, Sci::Position insertLength, bool &startSequence);
diff --git a/src/Document.cxx b/src/Document.cxx
index f3d8557ac..e53663f3e 100644
--- a/src/Document.cxx
+++ b/src/Document.cxx
@@ -119,6 +119,7 @@ Document::Document(int options) :
 	decorations = DecorationListCreate(IsLarge());
 
 	cb.SetPerLine(this);
+	cb.SetUTF8Substance(SC_CP_UTF8 == dbcsCodePage);
 }
 
 Document::~Document() {
@@ -194,6 +195,7 @@ bool Document::SetDBCSCodePage(int dbcsCodePage_) {
 		dbcsCodePage = dbcsCodePage_;
 		SetCaseFolder(nullptr);
 		cb.SetLineEndTypes(lineEndBitSet & LineEndTypesSupported());
+		cb.SetUTF8Substance(SC_CP_UTF8 == dbcsCodePage);
 		return true;
 	} else {
 		return false;
@@ -420,6 +422,14 @@ Sci::Position Document::VCHomePosition(Sci::Position position) const {
 		return startText;
 }
 
+Sci::Position Document::IndexLineStart(Sci::Line line, int lineCharacterIndex) const {
+	return cb.IndexLineStart(line, lineCharacterIndex);
+}
+
+Sci::Line Document::LineFromPositionIndex(Sci::Position pos, int lineCharacterIndex) const {
+	return cb.LineFromPositionIndex(pos, lineCharacterIndex);
+}
+
 int SCI_METHOD Document::SetLevel(Sci_Position line, int level) {
 	const int prev = Levels()->SetLevel(static_cast<Sci::Line>(line), level, LinesTotal());
 	if (prev != level) {
@@ -2105,6 +2115,18 @@ const char *Document::SubstituteByPosition(const char *text, Sci::Position *leng
 		return 0;
 }
 
+int Document::LineCharacterIndex() const {
+	return cb.LineCharacterIndex();
+}
+
+void Document::AllocateLineCharacterIndex(int lineCharacterIndex) {
+	return cb.AllocateLineCharacterIndex(lineCharacterIndex);
+}
+
+void Document::ReleaseLineCharacterIndex(int lineCharacterIndex) {
+	return cb.ReleaseLineCharacterIndex(lineCharacterIndex);
+}
+
 Sci::Line Document::LinesTotal() const noexcept {
 	return cb.Lines();
 }
diff --git a/src/Document.h b/src/Document.h
index e1613cb20..0ef967e09 100644
--- a/src/Document.h
+++ b/src/Document.h
@@ -389,6 +389,8 @@ public:
 	bool IsLineEndPosition(Sci::Position position) const;
 	bool IsPositionInLineEnd(Sci::Position position) const;
 	Sci::Position VCHomePosition(Sci::Position position) const;
+	Sci::Position IndexLineStart(Sci::Line line, int lineCharacterIndex) const;
+	Sci::Line LineFromPositionIndex(Sci::Position pos, int lineCharacterIndex) const;
 
 	int SCI_METHOD SetLevel(Sci_Position line, int level) override;
 	int SCI_METHOD GetLevel(Sci_Position line) const override;
@@ -414,6 +416,9 @@ public:
 	void SetCaseFolder(CaseFolder *pcf_);
 	Sci::Position FindText(Sci::Position minPos, Sci::Position maxPos, const char *search, int flags, Sci::Position *length);
 	const char *SubstituteByPosition(const char *text, Sci::Position *length);
+	int LineCharacterIndex() const;
+	void AllocateLineCharacterIndex(int lineCharacterIndex);
+	void ReleaseLineCharacterIndex(int lineCharacterIndex);
 	Sci::Line LinesTotal() const noexcept;
 
 	void SetDefaultCharClasses(bool includeWordClass);
diff --git a/src/Editor.cxx b/src/Editor.cxx
index 86c0536a1..3093e6c57 100644
--- a/src/Editor.cxx
+++ b/src/Editor.cxx
@@ -6020,6 +6020,11 @@ sptr_t Editor::WndProc(unsigned int iMessage, uptr_t wParam, sptr_t lParam) {
 			static_cast<Sci::Position>(wParam), lParam),
 			0, pdoc->Length());
 
+	case SCI_POSITIONRELATIVECODEUNITS:
+		return std::clamp<Sci::Position>(pdoc->GetRelativePositionUTF16(
+			static_cast<Sci::Position>(wParam), lParam),
+			0, pdoc->Length());
+
 	case SCI_LINESCROLL:
 		ScrollTo(topLine + static_cast<Sci::Line>(lParam));
 		HorizontalScrollTo(xOffset + static_cast<int>(wParam) * static_cast<int>(vs.spaceWidth));
@@ -6785,6 +6790,23 @@ sptr_t Editor::WndProc(unsigned int iMessage, uptr_t wParam, sptr_t lParam) {
 	case SCI_GETBIDIRECTIONAL:
 		return static_cast<sptr_t>(bidirectional);
 
+	case SCI_GETLINECHARACTERINDEX:
+		return pdoc->LineCharacterIndex();
+
+	case SCI_ALLOCATELINECHARACTERINDEX:
+		pdoc->AllocateLineCharacterIndex(static_cast<int>(wParam));
+		break;
+
+	case SCI_RELEASELINECHARACTERINDEX:
+		pdoc->ReleaseLineCharacterIndex(static_cast<int>(wParam));
+		break;
+
+	case SCI_LINEFROMINDEXPOSITION:
+		return pdoc->LineFromPositionIndex(static_cast<Sci::Position>(wParam), static_cast<int>(lParam));
+
+	case SCI_INDEXPOSITIONFROMLINE:
+		return pdoc->IndexLineStart(static_cast<Sci::Line>(wParam), static_cast<int>(lParam));
+
 		// Marker definition and setting
 	case SCI_MARKERDEFINE:
 		if (wParam <= MARKER_MAX) {
@@ -8190,6 +8212,10 @@ sptr_t Editor::WndProc(unsigned int iMessage, uptr_t wParam, sptr_t lParam) {
 
 	case SCI_COUNTCHARACTERS:
 		return pdoc->CountCharacters(static_cast<Sci::Position>(wParam), lParam);
+		//return pdoc->CountCharacters(static_cast<Sci::Position>(wParam), static_cast<Sci::Position>(lParam));
+
+	case SCI_COUNTCODEUNITS:
+		return pdoc->CountUTF16(static_cast<Sci::Position>(wParam), lParam);
 
 	default:
 		return DefWndProc(iMessage, wParam, lParam);
diff --git a/src/UniConversion.cxx b/src/UniConversion.cxx
index 3b7472638..58475687b 100644
--- a/src/UniConversion.cxx
+++ b/src/UniConversion.cxx
@@ -340,6 +340,22 @@ int UTF8DrawBytes(const unsigned char *us, int len) noexcept {
 	return (utf8StatusNext & UTF8MaskInvalid) ? 1 : (utf8StatusNext & UTF8MaskWidth);
 }
 
+bool UTF8IsValid(std::string_view sv) noexcept {
+	const unsigned char *us = reinterpret_cast<const unsigned char *>(sv.data());
+	size_t remaining = sv.length();
+	while (remaining > 0) {
+		const int utf8Status = UTF8Classify(us, remaining);
+		if (utf8Status & UTF8MaskInvalid) {
+			return false;
+		} else {
+			const int lenChar = utf8Status & UTF8MaskWidth;
+			us += lenChar;
+			remaining -= lenChar;
+		}
+	}
+	return remaining == 0;
+}
+
 // Replace invalid bytes in UTF-8 with the replacement character
 std::string FixInvalidUTF8(const std::string &text) {
 	std::string result;
diff --git a/src/UniConversion.h b/src/UniConversion.h
index 6d257cd8e..c676230da 100644
--- a/src/UniConversion.h
+++ b/src/UniConversion.h
@@ -22,6 +22,7 @@ size_t UTF16Length(std::string_view sv);
 size_t UTF16FromUTF8(std::string_view sv, wchar_t *tbuf, size_t tlen);
 size_t UTF32FromUTF8(std::string_view sv, unsigned int *tbuf, size_t tlen);
 unsigned int UTF16FromUTF32Character(unsigned int val, wchar_t *tbuf) noexcept;
+bool UTF8IsValid(std::string_view sv) noexcept;
 std::string FixInvalidUTF8(const std::string &text);
 
 extern const unsigned char UTF8BytesOfLead[256];
@@ -49,6 +50,9 @@ inline constexpr bool UTF8IsAscii(int ch) noexcept {
 
 enum { UTF8MaskWidth=0x7, UTF8MaskInvalid=0x8 };
 int UTF8Classify(const unsigned char *us, size_t len) noexcept;
+inline int UTF8Classify(std::string_view sv) noexcept {
+	return UTF8Classify(reinterpret_cast<const unsigned char *>(sv.data()), sv.length());
+}
 
 // Similar to UTF8Classify but returns a length of 1 for invalid bytes
 // instead of setting the invalid flag
author	Neil <nyamatongwe@gmail.com>	2018-07-10 15:06:50 +1000
committer	Neil <nyamatongwe@gmail.com>	2018-07-10 15:06:50 +1000
commit	56e20ea0283d8018dee48d736ba9dfef3c84dc3f (patch)
tree	21bdb500dfc092fadecb123b87e9799a2c46f6a9 /src
parent	d27cbe587930d13d3f1802b271d0d13e7e3c6e38 (diff)
download	scintilla-mirror-56e20ea0283d8018dee48d736ba9dfef3c84dc3f.tar.gz