From 56e20ea0283d8018dee48d736ba9dfef3c84dc3f Mon Sep 17 00:00:00 2001
From: Neil
○ Selection and information
- ○ Multiple Selection and Virtual Space
+ ○ By character or UTF-16 code unit
- ○ Scrolling and automatic
+ ○ Multiple Selection and Virtual Space
scrolling
+
○ Scrolling and automatic
+
○ White space
○ Cursor
- ○ Mouse capture
-
+
○ Mouse capture
+
○ Line endings
○ Words
- ○ Styling
-
+
○ Styling
+
○ Style definition
○ Caret, selection, and hotspot styles
- ○ Character representations
-
+
○ Character representations
+
○ Margins
○ Annotations
- ○ Other settings
-
+
○ Other settings
+
○ Brace highlighting
○ Tabs and Indentation
Guides
- ○ Markers
-
+
○ Markers
+
○ Indicators
○ Autocompletion
- ○ User lists
-
+
○ User lists
+
○ Call tips
○ Keyboard commands
- ○ Key bindings
-
+
○ Key bindings
+
○ Popup edit menu
○ Macro recording
- ○ Printing
-
+
○ Printing
+
○ Direct access
○ Multiple views
- ○ Background loading and saving
-
+
○ Background loading and saving
+
○ Folding
○ Line wrapping
- ○ Zooming
-
+
○ Zooming
+
○ Long lines
○ Accessibility
- ○ Lexer
-
+
○ Lexer
+
○ Lexer objects
○ Notifications
- ○ Images
-
+
○ Images
+
○ GTK+
○ Provisional messages
- ○ Deprecated messages
-
+
+ ○ Deprecated messages
+
○ Edit messages never
supported by Scintilla
○ Removed features
+
@@ -1229,8 +1233,6 @@ struct Sci_TextToFind {
○ Building Scintilla
-
-
@@ -1448,15 +1450,6 @@ struct Sci_TextToFind {
If called with a position within a multi byte character will return the position
of the start/end of that character.
SCI_POSITIONRELATIVE(int pos, int relative) → position
- Count a number of whole characters before or after the argument position and return that position.
- The minimum position returned is 0 and the maximum is the last position in the document.
- If the position goes past the document end then 0 is returned.
-
SCI_COUNTCHARACTERS(int start, int end) → int
- Returns the number of whole characters between two positions..
SCI_TEXTWIDTH(int style, const char *text) → int
This returns the pixel width of a string drawn in the given style which can
be used, for example, to decide how wide to make the line number margin in order to display a
@@ -1527,6 +1520,61 @@ struct Sci_TextToFind {
When this option is turned off, mouse selections will always stick to the mode the selection was started in. It
is off by default.
+ Most Scintilla APIs use byte positions but some applications want to use positions based on counting
+ (UTF-32) characters or (UTF-16) code units
+ or need to communicate with other code written in terms of characters or code units.
+ With only byte positions, this may require examining many bytes to count characters or code units in the document
+ but this may be sped up in some cases by indexing the line starts by character or code unit.
+ +
+ SCI_POSITIONRELATIVE(int pos, int relative) → position
+ SCI_POSITIONRELATIVECODEUNITS(int pos, int relative) → position
+ SCI_COUNTCHARACTERS(int start, int end) → int
+ SCI_COUNTCODEUNITS(int start, int end) → int
+ SCI_GETLINECHARACTERINDEX → int
+ SCI_ALLOCATELINECHARACTERINDEX(int lineCharacterIndex)
+ SCI_RELEASELINECHARACTERINDEX(int lineCharacterIndex)
+ SCI_LINEFROMINDEXPOSITION(int pos, int lineCharacterIndex) → int
+ SCI_INDEXPOSITIONFROMLINE(int line, int lineCharacterIndex) → int
+
+
+ SCI_POSITIONRELATIVE(int pos, int relative) → position
+ Count a number of whole characters before or after the argument position and return that position.
+ The minimum position returned is 0 and the maximum is the last position in the document.
+ If the position goes past the document end then 0 is returned.
+
SCI_COUNTCHARACTERS(int start, int end) → int
+ Returns the number of whole characters between two positions.
SCI_POSITIONRELATIVECODEUNITS(int pos, int relative) → position
+ SCI_COUNTCODEUNITS(int start, int end) → int
+ These are the UTF-16 versions of SCI_POSITIONRELATIVE and SCI_COUNTCHARACTERS
+ working in terms of UTF-16 code units.
SCI_GETLINECHARACTERINDEX → int
+ Returns which, if any, indexes are active. It may be SC_LINECHARACTERINDEX_NONE(0) or one or more
+ of SC_LINECHARACTERINDEX_UTF32(1) if whole characters are indexed or
+ SC_LINECHARACTERINDEX_UTF16(2) if UTF-16 code units are indexed.
+ Character indexes are currently only supported for UTF-8 documents.
SCI_ALLOCATELINECHARACTERINDEX(int lineCharacterIndex)
+ SCI_RELEASELINECHARACTERINDEX(int lineCharacterIndex)
+ Allocate or release one or more indexes using the same enumeration as SCI_GETLINECHARACTERINDEX.
+ Different aspects of an application may need indexes for different periods and should allocate for those periods.
+ Indexes use additional memory so releasing them can help minimize memory but they also take time to recalculate.
+ Scintilla may also allocate indexes to support features like accessibility or input method editors.
+ Only one index of each type is created for a document at a time.
SCI_LINEFROMINDEXPOSITION(int pos, int lineCharacterIndex) → int
+ SCI_INDEXPOSITIONFROMLINE(int line, int lineCharacterIndex) → int
+ The document line of a particular character or code unit may be found by calling SCI_LINEFROMINDEXPOSITION with one of
+ SC_LINECHARACTERINDEX_UTF32(1) or SC_LINECHARACTERINDEX_UTF16(2).
+ The inverse action, finding the starting position of a document line either in characters or code units from the document start, is performed by calling
+ SCI_INDEXPOSITIONFROMLINE with the same lineCharacterIndex argument.
diff --git a/doc/ScintillaHistory.html b/doc/ScintillaHistory.html
index 70895e649..365bb0ed0 100644
--- a/doc/ScintillaHistory.html
+++ b/doc/ScintillaHistory.html
@@ -550,6 +550,11 @@
Released 19 June 2018.
+ Optional indexing of line starts in UTF-8 documents by UTF-32 code points and UTF-16 code units added.
+ This can improve performance for clients that provide UTF-32 or UTF-16 interfaces or that need to interoperate
+ with UTF-32 or UTF-16 components.
+
+
Lexers added for SAS and Stata.
Feature #1185.
diff --git a/include/Scintilla.h b/include/Scintilla.h
index db4524f12..ccbeef99e 100644
--- a/include/Scintilla.h
+++ b/include/Scintilla.h
@@ -365,6 +365,7 @@ typedef sptr_t (*SciFnDirect)(sptr_t ptr, unsigned int iMessage, uptr_t wParam,
#define SCI_GETLINEINDENTPOSITION 2128
#define SCI_GETCOLUMN 2129
#define SCI_COUNTCHARACTERS 2633
+#define SCI_COUNTCODEUNITS 2715
#define SCI_SETHSCROLLBAR 2130
#define SCI_GETHSCROLLBAR 2131
#define SC_IV_NONE 0
@@ -753,6 +754,7 @@ typedef sptr_t (*SciFnDirect)(sptr_t ptr, unsigned int iMessage, uptr_t wParam,
#define SCI_POSITIONBEFORE 2417
#define SCI_POSITIONAFTER 2418
#define SCI_POSITIONRELATIVE 2670
+#define SCI_POSITIONRELATIVECODEUNITS 2716
#define SCI_COPYRANGE 2419
#define SCI_COPYTEXT 2420
#define SC_SEL_STREAM 0
@@ -1112,6 +1114,14 @@ typedef sptr_t (*SciFnDirect)(sptr_t ptr, unsigned int iMessage, uptr_t wParam,
#define SC_BIDIRECTIONAL_R2L 2
#define SCI_GETBIDIRECTIONAL 2708
#define SCI_SETBIDIRECTIONAL 2709
+#define SC_LINECHARACTERINDEX_NONE 0
+#define SC_LINECHARACTERINDEX_UTF32 1
+#define SC_LINECHARACTERINDEX_UTF16 2
+#define SCI_GETLINECHARACTERINDEX 2710
+#define SCI_ALLOCATELINECHARACTERINDEX 2711
+#define SCI_RELEASELINECHARACTERINDEX 2712
+#define SCI_LINEFROMINDEXPOSITION 2713
+#define SCI_INDEXPOSITIONFROMLINE 2714
#endif
/* --Autogenerated -- end of section automatically generated from Scintilla.iface */
diff --git a/include/Scintilla.iface b/include/Scintilla.iface
index 420a529a0..3719628a2 100644
--- a/include/Scintilla.iface
+++ b/include/Scintilla.iface
@@ -862,6 +862,9 @@ get int GetColumn=2129(position pos,)
# Count characters between two positions.
fun int CountCharacters=2633(position start, position end)
+# Count code units between two positions.
+fun int CountCodeUnits=2715(position start, position end)
+
# Show or hide the horizontal scroll bar.
set void SetHScrollBar=2130(bool visible,)
# Is the horizontal scroll bar visible?
@@ -1959,6 +1962,11 @@ fun position PositionAfter=2418(position pos,)
# of characters. Returned value is always between 0 and last position in document.
fun position PositionRelative=2670(position pos, int relative)
+# Given a valid document position, return a position that differs in a number
+# of UTF-16 code units. Returned value is always between 0 and last position in document.
+# The result may point half way (2 bytes) inside a non-BMP character.
+fun position PositionRelativeCodeUnits=2716(position pos, int relative)
+
# Copy a range of text to the clipboard. Positions are clipped into the document.
fun void CopyRange=2419(position start, position end)
@@ -4937,6 +4945,26 @@ get int GetBidirectional=2708(,)
# Set bidirectional text display state.
set void SetBidirectional=2709(int bidirectional,)
+enu LineCharacterIndexType=SC_LINECHARACTERINDEX_
+val SC_LINECHARACTERINDEX_NONE=0
+val SC_LINECHARACTERINDEX_UTF32=1
+val SC_LINECHARACTERINDEX_UTF16=2
+
+# Retrieve line character index state.
+get int GetLineCharacterIndex=2710(,)
+
+# Request line character index be created or its use count increased.
+fun void AllocateLineCharacterIndex=2711(int lineCharacterIndex,)
+
+# Decrease use count of line character index and remove if 0.
+fun void ReleaseLineCharacterIndex=2712(int lineCharacterIndex,)
+
+# Retrieve the document line containing a position measured in index units.
+fun int LineFromIndexPosition=2713(position posUTF32, int lineCharacterIndex)
+
+# Retrieve the position measured in index units at the start of a document line.
+fun position IndexPositionFromLine=2714(int line, int lineCharacterIndex)
+
cat Deprecated
# Divide each styling byte into lexical class bits (default: 5) and indicator
diff --git a/src/CellBuffer.cxx b/src/CellBuffer.cxx
index e8c385f1f..ffe5fe8b3 100644
--- a/src/CellBuffer.cxx
+++ b/src/CellBuffer.cxx
@@ -7,6 +7,7 @@
#include
#include
+#include
#include
#include
#include
@@ -28,17 +29,53 @@
namespace Scintilla {
+struct CountWidths {
+	// Tally of the characters in a run of text, with code points from the
+	// Base Multilingual Plane kept apart from those in the other planes so
+	// that both the UTF-32 and the UTF-16 width can be derived from one pass.
+	Sci::Position countBasePlane;
+	Sci::Position countOtherPlanes;
+	CountWidths(Sci::Position countBasePlane_=0, Sci::Position countOtherPlanes_=0) noexcept :
+		countBasePlane(countBasePlane_),
+		countOtherPlanes(countOtherPlanes_) {
+	}
+	// Both tallies negated, so a run can be subtracted by "inserting" its negation.
+	CountWidths operator-() const noexcept {
+		return {-countBasePlane, -countOtherPlanes};
+	}
+	// Width in UTF-32 code units: every code point is exactly one unit.
+	Sci::Position WidthUTF32() const noexcept {
+		return countOtherPlanes + countBasePlane;
+	}
+	// Width in UTF-16 code units: code points outside the base plane need two units.
+	Sci::Position WidthUTF16() const noexcept {
+		return countOtherPlanes * 2 + countBasePlane;
+	}
+	// Record one character given the length in bytes of its UTF-8 form.
+	// A length of 4 is counted as an other-plane character, anything else as base plane.
+	void CountChar(int lenChar) noexcept {
+		Sci::Position &tally = (lenChar == 4) ? countOtherPlanes : countBasePlane;
+		tally++;
+	}
+};
+
class ILineVector {
public:
virtual void Init() = 0;
virtual void SetPerLine(PerLine *pl) = 0;
virtual void InsertText(Sci::Line line, Sci::Position delta) = 0;
virtual void InsertLine(Sci::Line line, Sci::Position position, bool lineStart) = 0;
- virtual void SetLineStart(Sci::Line line, Sci::Position position) = 0;
+ virtual void SetLineStart(Sci::Line line, Sci::Position position) noexcept = 0;
virtual void RemoveLine(Sci::Line line) = 0;
virtual Sci::Line Lines() const noexcept = 0;
virtual Sci::Line LineFromPosition(Sci::Position pos) const noexcept = 0;
virtual Sci::Position LineStart(Sci::Line line) const noexcept = 0;
+ virtual void InsertCharacters(Sci::Line line, CountWidths delta) = 0;
+ virtual void SetLineCharactersWidth(Sci::Line line, CountWidths width) = 0;
+ virtual int LineCharacterIndex() const noexcept = 0;
+ virtual bool AllocateLineCharacterIndex(int lineCharacterIndex, Sci::Line lines) = 0;
+ virtual bool ReleaseLineCharacterIndex(int lineCharacterIndex) = 0;
+ virtual Sci::Position IndexLineStart(Sci::Line line, int lineCharacterIndex) const noexcept = 0;
+ virtual Sci::Line LineFromPositionIndex(Sci::Position pos, int lineCharacterIndex) const noexcept = 0;
virtual ~ILineVector() {}
};
@@ -46,10 +83,59 @@ public:
using namespace Scintilla;
+// Reference-counted index of line starts measured in some unit other than
+// bytes (whole characters or UTF-16 code units in this file).
+// POS is the integer type used by the underlying Partitioning.
+template <typename POS>
+class LineStartIndex {
+public:
+	// Number of outstanding Allocate calls; the index is live while this is > 0.
+	int refCount;
+	Partitioning<POS> starts;
+
+	LineStartIndex() : refCount(0), starts(4) {
+		// Minimal initial allocation
+	}
+	// Deleted so LineStartIndex objects can not be copied.
+	LineStartIndex(const LineStartIndex &) = delete;
+	LineStartIndex(LineStartIndex &&) = delete;
+	void operator=(const LineStartIndex &) = delete;
+	void operator=(LineStartIndex &&) = delete;
+	virtual ~LineStartIndex() {
+		starts.DeleteAll();
+	}
+	// Take a reference and make sure there is one partition per document line.
+	// Returns true when this is the first reference, in which case the caller
+	// must fill in the real line widths.
+	bool Allocate(Sci::Line lines) {
+		refCount++;
+		Sci::Position length = starts.PositionFromPartition(starts.Partitions());
+		for (Sci::Line line = starts.Partitions(); line < lines; line++) {
+			// Produce an ascending sequence that will be filled in with correct widths later
+			length++;
+			starts.InsertPartition(static_cast<POS>(line), static_cast<POS>(length));
+		}
+		return refCount == 1;
+	}
+	// Drop a reference, discarding the index when the last one goes.
+	// Returns true when the index was discarded by this call.
+	bool Release() {
+		if (refCount <= 0) {
+			// Unbalanced release: ignore it. Letting the count go negative would
+			// stop a later Allocate from ever reporting first use.
+			return false;
+		}
+		refCount--;
+		if (refCount == 0) {
+			starts.DeleteAll();
+			return true;
+		}
+		return false;
+	}
+	bool Active() const noexcept {
+		return refCount > 0;
+	}
+	// Width of a line in index units: the distance between consecutive line starts.
+	Sci::Position LineWidth(Sci::Line line) const noexcept {
+		return starts.PositionFromPartition(static_cast<POS>(line) + 1) -
+			starts.PositionFromPartition(static_cast<POS>(line));
+	}
+	// Set the width of a line by shifting all following line starts by the difference.
+	void SetLineWidth(Sci::Line line, Sci::Position width) {
+		const Sci::Position widthCurrent = LineWidth(line);
+		starts.InsertText(static_cast<POS>(line), static_cast<POS>(width - widthCurrent));
+	}
+};
+
template
class LineVector : public ILineVector {
Partitioning starts;
PerLine *perLine;
+ LineStartIndex startsUTF16;
+ LineStartIndex startsUTF32;
public:
LineVector() : starts(256), perLine(0) {
Init();
@@ -66,7 +152,9 @@ public:
if (perLine) {
perLine->Init();
}
- }
+ startsUTF32.starts.DeleteAll();
+ startsUTF16.starts.DeleteAll();
+ }
void SetPerLine(PerLine *pl) override {
perLine = pl;
}
@@ -74,18 +162,33 @@ public:
starts.InsertText(static_cast(line), static_cast(delta));
}
void InsertLine(Sci::Line line, Sci::Position position, bool lineStart) override {
- starts.InsertPartition(static_cast(line), static_cast(position));
+ const POS lineAsPos = static_cast(line);
+ starts.InsertPartition(lineAsPos, static_cast(position));
+ if (startsUTF32.Active()) {
+ startsUTF32.starts.InsertPartition(lineAsPos,
+ static_cast(startsUTF32.starts.PositionFromPartition(lineAsPos - 1) + 1));
+ }
+ if (startsUTF16.Active()) {
+ startsUTF16.starts.InsertPartition(lineAsPos,
+ static_cast(startsUTF16.starts.PositionFromPartition(lineAsPos - 1) + 1));
+ }
if (perLine) {
if ((line > 0) && lineStart)
line--;
perLine->InsertLine(line);
}
}
- void SetLineStart(Sci::Line line, Sci::Position position) override {
+ void SetLineStart(Sci::Line line, Sci::Position position) noexcept override {
starts.SetPartitionStartPosition(static_cast(line), static_cast(position));
}
void RemoveLine(Sci::Line line) override {
starts.RemovePartition(static_cast(line));
+ if (startsUTF32.Active()) {
+ startsUTF32.starts.RemovePartition(static_cast(line));
+ }
+ if (startsUTF16.Active()) {
+ startsUTF16.starts.RemovePartition(static_cast(line));
+ }
if (perLine) {
perLine->RemoveLine(line);
}
@@ -99,6 +202,71 @@ public:
Sci::Position LineStart(Sci::Line line) const noexcept override {
return starts.PositionFromPartition(static_cast(line));
}
+ // Shift the starts of all lines after 'line' in each active index by the
+ // width of 'delta' in that index's units. A negated 'delta' shrinks the line.
+ void InsertCharacters(Sci::Line line, CountWidths delta) override {
+		const POS lineAsPos = static_cast<POS>(line);
+		if (startsUTF32.Active()) {
+			startsUTF32.starts.InsertText(lineAsPos, static_cast<POS>(delta.WidthUTF32()));
+		}
+		if (startsUTF16.Active()) {
+			startsUTF16.starts.InsertText(lineAsPos, static_cast<POS>(delta.WidthUTF16()));
+		}
+ }
+ // Store the measured width of 'line' into every active index.
+ // Each active index is expected to have one partition per document line.
+ void SetLineCharactersWidth(Sci::Line line, CountWidths width) override {
+		if (startsUTF32.Active()) {
+			assert(starts.Partitions() == startsUTF32.starts.Partitions());
+			const Sci::Position widthUTF32 = width.WidthUTF32();
+			startsUTF32.SetLineWidth(line, widthUTF32);
+		}
+		if (startsUTF16.Active()) {
+			assert(starts.Partitions() == startsUTF16.starts.Partitions());
+			const Sci::Position widthUTF16 = width.WidthUTF16();
+			startsUTF16.SetLineWidth(line, widthUTF16);
+		}
+ }
+
+ // Bit set of SC_LINECHARACTERINDEX_* flags naming the indexes currently active.
+ int LineCharacterIndex() const noexcept override {
+		const int flagUTF32 = startsUTF32.Active() ?
+			SC_LINECHARACTERINDEX_UTF32 : SC_LINECHARACTERINDEX_NONE;
+		const int flagUTF16 = startsUTF16.Active() ?
+			SC_LINECHARACTERINDEX_UTF16 : SC_LINECHARACTERINDEX_NONE;
+		return flagUTF32 | flagUTF16;
+ }
+ // Take a reference on each index named in 'lineCharacterIndex', sizing it for 'lines' lines.
+ // Returns true if any index went from unused to used and so needs its widths calculated.
+ bool AllocateLineCharacterIndex(int lineCharacterIndex, Sci::Line lines) override {
+		const bool wantUTF32 = (lineCharacterIndex & SC_LINECHARACTERINDEX_UTF32) != 0;
+		const bool wantUTF16 = (lineCharacterIndex & SC_LINECHARACTERINDEX_UTF16) != 0;
+		bool changed = false;
+		if (wantUTF32) {
+			if (startsUTF32.Allocate(lines)) {
+				changed = true;
+			}
+			assert(startsUTF32.starts.Partitions() == starts.Partitions());
+		}
+		if (wantUTF16) {
+			if (startsUTF16.Allocate(lines)) {
+				changed = true;
+			}
+			assert(startsUTF16.starts.Partitions() == starts.Partitions());
+		}
+		return changed;
+ }
+ // Drop a reference on each index named in 'lineCharacterIndex'.
+ // Returns true if any of the Release calls reported that its count reached zero.
+ bool ReleaseLineCharacterIndex(int lineCharacterIndex) override {
+		const bool dropUTF32 = (lineCharacterIndex & SC_LINECHARACTERINDEX_UTF32) != 0;
+		const bool dropUTF16 = (lineCharacterIndex & SC_LINECHARACTERINDEX_UTF16) != 0;
+		bool changed = false;
+		if (dropUTF32 && startsUTF32.Release()) {
+			changed = true;
+		}
+		if (dropUTF16 && startsUTF16.Release()) {
+			changed = true;
+		}
+		return changed;
+ }
+ // Start of 'line' measured in the units of the selected index.
+ // SC_LINECHARACTERINDEX_UTF32 selects the character index; any other value
+ // selects the UTF-16 code unit index.
+ Sci::Position IndexLineStart(Sci::Line line, int lineCharacterIndex) const noexcept override {
+		if (lineCharacterIndex == SC_LINECHARACTERINDEX_UTF32) {
+			return startsUTF32.starts.PositionFromPartition(static_cast<POS>(line));
+		} else {
+			return startsUTF16.starts.PositionFromPartition(static_cast<POS>(line));
+		}
+ }
+ // Line containing 'pos', where 'pos' is measured in the units of the selected index.
+ // SC_LINECHARACTERINDEX_UTF32 selects the character index; any other value
+ // selects the UTF-16 code unit index.
+ Sci::Line LineFromPositionIndex(Sci::Position pos, int lineCharacterIndex) const noexcept override {
+		if (lineCharacterIndex == SC_LINECHARACTERINDEX_UTF32) {
+			return static_cast<Sci::Line>(startsUTF32.starts.PartitionFromPosition(static_cast<POS>(pos)));
+		} else {
+			return static_cast<Sci::Line>(startsUTF16.starts.PartitionFromPosition(static_cast<POS>(pos)));
+		}
+ }
};
Action::Action() {
@@ -364,6 +532,7 @@ void UndoHistory::CompletedRedoStep() {
CellBuffer::CellBuffer(bool hasStyles_, bool largeDocument_) :
hasStyles(hasStyles_), largeDocument(largeDocument_) {
readOnly = false;
+ utf8Substance = false;
utf8LineEnds = 0;
collectingUndo = true;
if (largeDocument)
@@ -505,10 +674,19 @@ void CellBuffer::Allocate(Sci::Position newSize) {
}
}
+// Record whether the buffer contents are to be treated as UTF-8.
+// A change of state rebuilds the line data through ResetLineEnds.
+void CellBuffer::SetUTF8Substance(bool utf8Substance_) {
+	if (utf8Substance == utf8Substance_) {
+		// Nothing changed so keep the existing line data
+		return;
+	}
+	utf8Substance = utf8Substance_;
+	ResetLineEnds();
+}
+
void CellBuffer::SetLineEndTypes(int utf8LineEnds_) {
if (utf8LineEnds != utf8LineEnds_) {
+ const int indexes = plv->LineCharacterIndex();
utf8LineEnds = utf8LineEnds_;
ResetLineEnds();
+ AllocateLineCharacterIndex(indexes);
}
}
@@ -535,6 +713,23 @@ void CellBuffer::SetPerLine(PerLine *pl) {
plv->SetPerLine(pl);
}
+// Report which line character indexes are active, as returned by the line vector.
+int CellBuffer::LineCharacterIndex() const noexcept {
+ return plv->LineCharacterIndex();
+}
+
+// Request the indexes named in 'lineCharacterIndex'. Ignored unless the buffer is UTF-8.
+// When the line vector reports a change, the widths of every line are calculated.
+void CellBuffer::AllocateLineCharacterIndex(int lineCharacterIndex) {
+	if (!utf8Substance) {
+		return;
+	}
+	const bool indexChanged = plv->AllocateLineCharacterIndex(lineCharacterIndex, Lines());
+	if (indexChanged) {
+		// Changed so recalculate whole file
+		RecalculateIndexLineStarts(0, Lines() - 1);
+	}
+}
+
+// Forward a release of the indexes named in 'lineCharacterIndex' to the line vector.
+// The line vector's result is not used here.
+void CellBuffer::ReleaseLineCharacterIndex(int lineCharacterIndex) {
+ plv->ReleaseLineCharacterIndex(lineCharacterIndex);
+}
+
Sci::Line CellBuffer::Lines() const noexcept {
return plv->Lines();
}
@@ -552,6 +747,14 @@ Sci::Line CellBuffer::LineFromPosition(Sci::Position pos) const noexcept {
return plv->LineFromPosition(pos);
}
+// Start of 'line' in the units of the index selected by 'lineCharacterIndex',
+// as reported by the line vector.
+Sci::Position CellBuffer::IndexLineStart(Sci::Line line, int lineCharacterIndex) const noexcept {
+ return plv->IndexLineStart(line, lineCharacterIndex);
+}
+
+// Line containing 'pos', where 'pos' is in the units of the index selected by
+// 'lineCharacterIndex', as reported by the line vector.
+Sci::Line CellBuffer::LineFromPositionIndex(Sci::Position pos, int lineCharacterIndex) const noexcept {
+ return plv->LineFromPositionIndex(pos, lineCharacterIndex);
+}
+
bool CellBuffer::IsReadOnly() const {
return readOnly;
}
@@ -612,6 +815,37 @@ bool CellBuffer::UTF8LineEndOverlaps(Sci::Position position) const {
return UTF8IsSeparator(bytes) || UTF8IsSeparator(bytes+1) || UTF8IsNEL(bytes+1);
}
+// Decide whether 'position' lies between characters rather than inside a
+// multi-byte UTF-8 sequence. Insertion and deletion use this to choose between
+// an incremental index update and recalculating the affected line.
+// 'position' must be within [0, Length()].
+bool CellBuffer::UTF8IsCharacterBoundary(Sci::Position position) const {
+ assert(position >= 0 && position <= Length());
+ if (position > 0) {
+ // Gather bytes backwards, beginning with the byte at 'position' itself,
+ // until a non-trail byte is found or UTF8MaxBytes bytes have been examined.
+ std::string back;
+ for (int i = 0; i < UTF8MaxBytes; i++) {
+ const Sci::Position posBack = position - i;
+ if (posBack < 0) {
+ // Ran off the start of the document while still seeing trail bytes
+ return false;
+ }
+ back.insert(0, 1, substance.ValueAt(posBack));
+ if (!UTF8IsTrailByte(back.front())) {
+ if (i > 0) {
+ // Have reached a non-trail
+ // Reject unless the bytes gathered classify as a valid sequence
+ // whose width is exactly i, that is, one ending just before 'position'.
+ // NOTE(review): i > 0 is only reached when the byte read at 'position'
+ // was a trail byte; when position < Length() the final check below
+ // then returns false as well - confirm whether the scan was meant
+ // to start at position - 1.
+ const int cla = UTF8Classify(back);
+ if ((cla & UTF8MaskInvalid) || (cla != i)) {
+ return false;
+ }
+ }
+ break;
+ }
+ }
+ }
+ if (position < Length()) {
+ // A trail byte at 'position' means the position is inside a sequence
+ const unsigned char fore = substance.ValueAt(position);
+ if (UTF8IsTrailByte(fore)) {
+ return false;
+ }
+ }
+ return true;
+}
+
void CellBuffer::ResetLineEnds() {
// Reinitialize line data -- too much work to preserve
plv->Init();
@@ -648,6 +882,38 @@ void CellBuffer::ResetLineEnds() {
}
}
+namespace {
+
+// Walk a UTF-8 string one classified sequence at a time and tally how many
+// characters it holds, split into base plane and other planes.
+CountWidths CountCharacterWidthsUTF8(std::string_view sv) noexcept {
+	CountWidths widths;
+	while (!sv.empty()) {
+		// The width bits of the classification give the bytes to consume
+		const int lenChar = UTF8Classify(sv) & UTF8MaskWidth;
+		widths.CountChar(lenChar);
+		sv.remove_prefix(lenChar);
+	}
+	return widths;
+}
+
+}
+
+// Measure the lines lineFirst..lineLast (inclusive) and store their character
+// widths into the active line character indexes.
+void CellBuffer::RecalculateIndexLineStarts(Sci::Line lineFirst, Sci::Line lineLast) {
+	if (plv->LineCharacterIndex() == SC_LINECHARACTERINDEX_NONE) {
+		// No index is active so SetLineCharactersWidth would discard every result.
+		// Insertion reaches here for every edit in that case, so avoid copying
+		// and scanning the line text for nothing.
+		return;
+	}
+	std::string text;
+	Sci::Position posLineEnd = LineStart(lineFirst);
+	for (Sci::Line line = lineFirst; line <= lineLast; line++) {
+		// Find line start and end, retrieve text of line, count characters and update line width
+		const Sci::Position posLineStart = posLineEnd;
+		posLineEnd = LineStart(line+1);
+		const Sci::Position width = posLineEnd - posLineStart;
+		// 'text' is reused between lines so its capacity is kept
+		text.resize(width);
+		GetCharRange(text.data(), posLineStart, width);
+		const CountWidths cw = CountCharacterWidthsUTF8(text);
+		plv->SetLineCharactersWidth(line, cw);
+	}
+}
+
void CellBuffer::BasicInsertString(Sci::Position position, const char *s, Sci::Position insertLength) {
if (insertLength == 0)
return;
@@ -659,12 +925,25 @@ void CellBuffer::BasicInsertString(Sci::Position position, const char *s, Sci::P
breakingUTF8LineEnd = UTF8LineEndOverlaps(position);
}
+ const Sci::Line linePosition = plv->LineFromPosition(position);
+ Sci::Line lineInsert = linePosition + 1;
+
+ // A simple insertion is one that inserts valid text on a single line at a character boundary
+ bool simpleInsertion = false;
+
+ // Check for breaking apart a UTF-8 sequence and inserting invalid UTF-8
+ if (utf8Substance && (plv->LineCharacterIndex() != SC_LINECHARACTERINDEX_NONE)) {
+ // Actually, don't need to check that whole insertion is valid just that there
+ // are no potential fragments at ends.
+ simpleInsertion = UTF8IsCharacterBoundary(position) &&
+ UTF8IsValid(std::string_view(s, insertLength));
+ }
+
substance.InsertFromArray(position, s, 0, insertLength);
if (hasStyles) {
style.InsertValue(position, insertLength, 0);
}
- Sci::Line lineInsert = plv->LineFromPosition(position) + 1;
const bool atLineStart = plv->LineStart(lineInsert-1) == position;
// Point all the lines after the insertion point further along in the buffer
plv->InsertText(lineInsert-1, insertLength);
@@ -684,6 +963,7 @@ void CellBuffer::BasicInsertString(Sci::Position position, const char *s, Sci::P
if (ch == '\r') {
InsertLine(lineInsert, (position + i) + 1, atLineStart);
lineInsert++;
+ simpleInsertion = false;
} else if (ch == '\n') {
if (chPrev == '\r') {
// Patch up what was end of line
@@ -692,11 +972,13 @@ void CellBuffer::BasicInsertString(Sci::Position position, const char *s, Sci::P
InsertLine(lineInsert, (position + i) + 1, atLineStart);
lineInsert++;
}
+ simpleInsertion = false;
} else if (utf8LineEnds) {
const unsigned char back3[3] = {chBeforePrev, chPrev, ch};
if (UTF8IsSeparator(back3) || UTF8IsNEL(back3+1)) {
InsertLine(lineInsert, (position + i) + 1, atLineStart);
lineInsert++;
+ simpleInsertion = false;
}
}
chBeforePrev = chPrev;
@@ -707,6 +989,7 @@ void CellBuffer::BasicInsertString(Sci::Position position, const char *s, Sci::P
if (ch == '\r') {
// End of line already in buffer so drop the newly created one
RemoveLine(lineInsert - 1);
+ simpleInsertion = false;
}
} else if (utf8LineEnds && !UTF8IsAscii(chAfter)) {
// May have end of UTF-8 line end in buffer and start in insertion
@@ -716,21 +999,31 @@ void CellBuffer::BasicInsertString(Sci::Position position, const char *s, Sci::P
if (UTF8IsSeparator(back3)) {
InsertLine(lineInsert, (position + insertLength + j) + 1, atLineStart);
lineInsert++;
+ simpleInsertion = false;
}
if ((j == 0) && UTF8IsNEL(back3+1)) {
InsertLine(lineInsert, (position + insertLength + j) + 1, atLineStart);
lineInsert++;
+ simpleInsertion = false;
}
chBeforePrev = chPrev;
chPrev = chAt;
}
}
+ if (simpleInsertion) {
+ const CountWidths cw = CountCharacterWidthsUTF8(std::string_view(s, insertLength));
+ plv->InsertCharacters(linePosition, cw);
+ } else {
+ RecalculateIndexLineStarts(linePosition, lineInsert - 1);
+ }
}
void CellBuffer::BasicDeleteChars(Sci::Position position, Sci::Position deleteLength) {
if (deleteLength == 0)
return;
+ Sci::Line lineRecalculateStart = INVALID_POSITION;
+
if ((position == 0) && (deleteLength == substance.Length())) {
// If whole buffer is being deleted, faster to reinitialise lines data
// than to delete each line.
@@ -739,11 +1032,37 @@ void CellBuffer::BasicDeleteChars(Sci::Position position, Sci::Position deleteLe
// Have to fix up line positions before doing deletion as looking at text in buffer
// to work out which lines have been removed
- Sci::Line lineRemove = plv->LineFromPosition(position) + 1;
+ const Sci::Line linePosition = plv->LineFromPosition(position);
+ Sci::Line lineRemove = linePosition + 1;
+
plv->InsertText(lineRemove-1, - (deleteLength));
const unsigned char chPrev = substance.ValueAt(position - 1);
const unsigned char chBefore = chPrev;
unsigned char chNext = substance.ValueAt(position);
+
+ // Check for breaking apart a UTF-8 sequence
+ // Needs further checks that text is UTF-8 or that some other break apart is occurring
+ if (utf8Substance && (plv->LineCharacterIndex() != SC_LINECHARACTERINDEX_NONE)) {
+ const Sci::Position posEnd = position + deleteLength;
+ const Sci::Line lineEndRemove = plv->LineFromPosition(posEnd);
+ const bool simpleDeletion =
+ (linePosition == lineEndRemove) &&
+ UTF8IsCharacterBoundary(position) && UTF8IsCharacterBoundary(posEnd);
+ if (simpleDeletion) {
+ std::string text(deleteLength, '\0');
+ GetCharRange(text.data(), position, deleteLength);
+ if (UTF8IsValid(text)) {
+ // Everything is good
+ const CountWidths cw = CountCharacterWidthsUTF8(text);
+ plv->InsertCharacters(linePosition, -cw);
+ } else {
+ lineRecalculateStart = linePosition;
+ }
+ } else {
+ lineRecalculateStart = linePosition;
+ }
+ }
+
bool ignoreNL = false;
if (chPrev == '\r' && chNext == '\n') {
// Move back one
@@ -792,6 +1111,9 @@ void CellBuffer::BasicDeleteChars(Sci::Position position, Sci::Position deleteLe
}
}
substance.DeleteRange(position, deleteLength);
+ if (lineRecalculateStart >= 0) {
+ RecalculateIndexLineStarts(lineRecalculateStart, lineRecalculateStart);
+ }
if (hasStyles) {
style.DeleteRange(position, deleteLength);
}
diff --git a/src/CellBuffer.h b/src/CellBuffer.h
index f360b2a23..b9f2406f1 100644
--- a/src/CellBuffer.h
+++ b/src/CellBuffer.h
@@ -113,6 +113,7 @@ private:
SplitVector substance;
SplitVector style;
bool readOnly;
+ bool utf8Substance;
int utf8LineEnds;
bool collectingUndo;
@@ -121,7 +122,9 @@ private:
std::unique_ptr plv;
bool UTF8LineEndOverlaps(Sci::Position position) const;
+ bool UTF8IsCharacterBoundary(Sci::Position position) const;
void ResetLineEnds();
+ void RecalculateIndexLineStarts(Sci::Line lineFirst, Sci::Line lineLast);
/// Actions without undo
void BasicInsertString(Sci::Position position, const char *s, Sci::Position insertLength);
void BasicDeleteChars(Sci::Position position, Sci::Position deleteLength);
@@ -148,13 +151,19 @@ public:
Sci::Position Length() const noexcept;
void Allocate(Sci::Position newSize);
+ void SetUTF8Substance(bool utf8Substance_);
int GetLineEndTypes() const { return utf8LineEnds; }
void SetLineEndTypes(int utf8LineEnds_);
bool ContainsLineEnd(const char *s, Sci::Position length) const;
void SetPerLine(PerLine *pl);
+ int LineCharacterIndex() const noexcept;
+ void AllocateLineCharacterIndex(int lineCharacterIndex);
+ void ReleaseLineCharacterIndex(int lineCharacterIndex);
Sci::Line Lines() const noexcept;
Sci::Position LineStart(Sci::Line line) const noexcept;
+ Sci::Position IndexLineStart(Sci::Line line, int lineCharacterIndex) const noexcept;
Sci::Line LineFromPosition(Sci::Position pos) const noexcept;
+ Sci::Line LineFromPositionIndex(Sci::Position pos, int lineCharacterIndex) const noexcept;
void InsertLine(Sci::Line line, Sci::Position position, bool lineStart);
void RemoveLine(Sci::Line line);
const char *InsertString(Sci::Position position, const char *s, Sci::Position insertLength, bool &startSequence);
diff --git a/src/Document.cxx b/src/Document.cxx
index f3d8557ac..e53663f3e 100644
--- a/src/Document.cxx
+++ b/src/Document.cxx
@@ -119,6 +119,7 @@ Document::Document(int options) :
decorations = DecorationListCreate(IsLarge());
cb.SetPerLine(this);
+ cb.SetUTF8Substance(SC_CP_UTF8 == dbcsCodePage);
}
Document::~Document() {
@@ -194,6 +195,7 @@ bool Document::SetDBCSCodePage(int dbcsCodePage_) {
dbcsCodePage = dbcsCodePage_;
SetCaseFolder(nullptr);
cb.SetLineEndTypes(lineEndBitSet & LineEndTypesSupported());
+ cb.SetUTF8Substance(SC_CP_UTF8 == dbcsCodePage);
return true;
} else {
return false;
@@ -420,6 +422,14 @@ Sci::Position Document::VCHomePosition(Sci::Position position) const {
return startText;
}
+// Start of 'line' in the units of the index selected by 'lineCharacterIndex';
+// forwarded to the cell buffer.
+Sci::Position Document::IndexLineStart(Sci::Line line, int lineCharacterIndex) const {
+ return cb.IndexLineStart(line, lineCharacterIndex);
+}
+
+// Line containing 'pos', where 'pos' is in the units of the index selected by
+// 'lineCharacterIndex'; forwarded to the cell buffer.
+Sci::Line Document::LineFromPositionIndex(Sci::Position pos, int lineCharacterIndex) const {
+ return cb.LineFromPositionIndex(pos, lineCharacterIndex);
+}
+
int SCI_METHOD Document::SetLevel(Sci_Position line, int level) {
const int prev = Levels()->SetLevel(static_cast(line), level, LinesTotal());
if (prev != level) {
@@ -2105,6 +2115,18 @@ const char *Document::SubstituteByPosition(const char *text, Sci::Position *leng
return 0;
}
+// Report which line character indexes are active; forwarded to the cell buffer.
+int Document::LineCharacterIndex() const {
+ return cb.LineCharacterIndex();
+}
+
+// Request the line character indexes named in 'lineCharacterIndex' from the cell buffer.
+void Document::AllocateLineCharacterIndex(int lineCharacterIndex) {
+	cb.AllocateLineCharacterIndex(lineCharacterIndex);
+}
+
+// Release the line character indexes named in 'lineCharacterIndex' in the cell buffer.
+void Document::ReleaseLineCharacterIndex(int lineCharacterIndex) {
+	cb.ReleaseLineCharacterIndex(lineCharacterIndex);
+}
+
Sci::Line Document::LinesTotal() const noexcept {
return cb.Lines();
}
diff --git a/src/Document.h b/src/Document.h
index e1613cb20..0ef967e09 100644
--- a/src/Document.h
+++ b/src/Document.h
@@ -389,6 +389,8 @@ public:
bool IsLineEndPosition(Sci::Position position) const;
bool IsPositionInLineEnd(Sci::Position position) const;
Sci::Position VCHomePosition(Sci::Position position) const;
+ Sci::Position IndexLineStart(Sci::Line line, int lineCharacterIndex) const;
+ Sci::Line LineFromPositionIndex(Sci::Position pos, int lineCharacterIndex) const;
int SCI_METHOD SetLevel(Sci_Position line, int level) override;
int SCI_METHOD GetLevel(Sci_Position line) const override;
@@ -414,6 +416,9 @@ public:
void SetCaseFolder(CaseFolder *pcf_);
Sci::Position FindText(Sci::Position minPos, Sci::Position maxPos, const char *search, int flags, Sci::Position *length);
const char *SubstituteByPosition(const char *text, Sci::Position *length);
+ int LineCharacterIndex() const;
+ void AllocateLineCharacterIndex(int lineCharacterIndex);
+ void ReleaseLineCharacterIndex(int lineCharacterIndex);
Sci::Line LinesTotal() const noexcept;
void SetDefaultCharClasses(bool includeWordClass);
diff --git a/src/Editor.cxx b/src/Editor.cxx
index 86c0536a1..3093e6c57 100644
--- a/src/Editor.cxx
+++ b/src/Editor.cxx
@@ -6020,6 +6020,11 @@ sptr_t Editor::WndProc(unsigned int iMessage, uptr_t wParam, sptr_t lParam) {
static_cast(wParam), lParam),
0, pdoc->Length());
+ case SCI_POSITIONRELATIVECODEUNITS:
+ return std::clamp(pdoc->GetRelativePositionUTF16(
+ static_cast(wParam), lParam),
+ 0, pdoc->Length());
+
case SCI_LINESCROLL:
ScrollTo(topLine + static_cast(lParam));
HorizontalScrollTo(xOffset + static_cast(wParam) * static_cast(vs.spaceWidth));
@@ -6785,6 +6790,23 @@ sptr_t Editor::WndProc(unsigned int iMessage, uptr_t wParam, sptr_t lParam) {
case SCI_GETBIDIRECTIONAL:
return static_cast(bidirectional);
+ case SCI_GETLINECHARACTERINDEX:
+ return pdoc->LineCharacterIndex();
+
+ case SCI_ALLOCATELINECHARACTERINDEX:
+ pdoc->AllocateLineCharacterIndex(static_cast(wParam));
+ break;
+
+ case SCI_RELEASELINECHARACTERINDEX:
+ pdoc->ReleaseLineCharacterIndex(static_cast(wParam));
+ break;
+
+ case SCI_LINEFROMINDEXPOSITION:
+ return pdoc->LineFromPositionIndex(static_cast(wParam), static_cast(lParam));
+
+ case SCI_INDEXPOSITIONFROMLINE:
+ return pdoc->IndexLineStart(static_cast(wParam), static_cast(lParam));
+
// Marker definition and setting
case SCI_MARKERDEFINE:
if (wParam <= MARKER_MAX) {
@@ -8190,6 +8212,10 @@ sptr_t Editor::WndProc(unsigned int iMessage, uptr_t wParam, sptr_t lParam) {
case SCI_COUNTCHARACTERS:
return pdoc->CountCharacters(static_cast(wParam), lParam);
+ //return pdoc->CountCharacters(static_cast(wParam), static_cast(lParam));
+
+ case SCI_COUNTCODEUNITS:
+ return pdoc->CountUTF16(static_cast(wParam), lParam);
default:
return DefWndProc(iMessage, wParam, lParam);
diff --git a/src/UniConversion.cxx b/src/UniConversion.cxx
index 3b7472638..58475687b 100644
--- a/src/UniConversion.cxx
+++ b/src/UniConversion.cxx
@@ -340,6 +340,22 @@ int UTF8DrawBytes(const unsigned char *us, int len) noexcept {
return (utf8StatusNext & UTF8MaskInvalid) ? 1 : (utf8StatusNext & UTF8MaskWidth);
}
+// Report whether sv consists solely of sequences that UTF8Classify accepts.
+// The view is walked one sequence at a time: the unconsumed tail is classified,
+// and the first result carrying UTF8MaskInvalid makes the whole view invalid.
+// An empty view is valid.
+bool UTF8IsValid(std::string_view sv) noexcept {
+	const unsigned char *us = reinterpret_cast<const unsigned char *>(sv.data());
+	size_t remaining = sv.length();
+	while (remaining > 0) {
+		const int utf8Status = UTF8Classify(us, remaining);
+		if (utf8Status & UTF8MaskInvalid) {
+			return false;
+		}
+		// The width bits give the byte length of the sequence just classified.
+		const int lenChar = utf8Status & UTF8MaskWidth;
+		us += lenChar;
+		remaining -= lenChar;
+	}
+	// The loop only ends once every byte was consumed by an accepted sequence.
+	return true;
+}
+
// Replace invalid bytes in UTF-8 with the replacement character
std::string FixInvalidUTF8(const std::string &text) {
std::string result;
diff --git a/src/UniConversion.h b/src/UniConversion.h
index 6d257cd8e..c676230da 100644
--- a/src/UniConversion.h
+++ b/src/UniConversion.h
@@ -22,6 +22,7 @@ size_t UTF16Length(std::string_view sv);
size_t UTF16FromUTF8(std::string_view sv, wchar_t *tbuf, size_t tlen);
size_t UTF32FromUTF8(std::string_view sv, unsigned int *tbuf, size_t tlen);
unsigned int UTF16FromUTF32Character(unsigned int val, wchar_t *tbuf) noexcept;
+bool UTF8IsValid(std::string_view sv) noexcept;
std::string FixInvalidUTF8(const std::string &text);
extern const unsigned char UTF8BytesOfLead[256];
@@ -49,6 +50,9 @@ inline constexpr bool UTF8IsAscii(int ch) noexcept {
enum { UTF8MaskWidth=0x7, UTF8MaskInvalid=0x8 };
int UTF8Classify(const unsigned char *us, size_t len) noexcept;
+// Convenience overload: classify the sequence at the start of sv by passing its
+// bytes and length to the pointer/length UTF8Classify declared just above.
+inline int UTF8Classify(std::string_view sv) noexcept {
+	return UTF8Classify(reinterpret_cast<const unsigned char *>(sv.data()), sv.length());
+}
// Similar to UTF8Classify but returns a length of 1 for invalid bytes
// instead of setting the invalid flag
diff --git a/test/simpleTests.py b/test/simpleTests.py
index 3ff283dad..b1e8efdb7 100644
--- a/test/simpleTests.py
+++ b/test/simpleTests.py
@@ -1631,6 +1631,76 @@ class TestStyleAttributes(unittest.TestCase):
self.ed.StyleSetHotSpot(self.ed.STYLE_DEFAULT, 1)
self.assertEquals(self.ed.StyleGetHotSpot(self.ed.STYLE_DEFAULT), 1)
+class TestIndices(unittest.TestCase):
+ def setUp(self):
+ self.xite = Xite.xiteFrame
+ self.ed = self.xite.ed
+ self.ed.ClearAll()
+ self.ed.EmptyUndoBuffer()
+ self.ed.SetCodePage(65001)
+ # Text includes one non-BMP character
+ t = "aå\U00010348flﬔ-\n"
+ self.tv = t.encode("UTF-8")
+
+ def tearDown(self):
+ self.ed.SetCodePage(0)
+
+ def testAllocation(self):
+ self.assertEquals(self.ed.GetLineCharacterIndex(), self.ed.SC_LINECHARACTERINDEX_NONE)
+ self.ed.AllocateLineCharacterIndex(self.ed.SC_LINECHARACTERINDEX_UTF32)
+ self.assertEquals(self.ed.GetLineCharacterIndex(), self.ed.SC_LINECHARACTERINDEX_UTF32)
+ self.ed.ReleaseLineCharacterIndex(self.ed.SC_LINECHARACTERINDEX_UTF32)
+ self.assertEquals(self.ed.GetLineCharacterIndex(), self.ed.SC_LINECHARACTERINDEX_NONE)
+
+ def testUTF32(self):
+ self.assertEquals(self.ed.GetLineCharacterIndex(), self.ed.SC_LINECHARACTERINDEX_NONE)
+ self.ed.SetContents(self.tv)
+ self.ed.AllocateLineCharacterIndex(self.ed.SC_LINECHARACTERINDEX_UTF32)
+ self.assertEquals(self.ed.IndexPositionFromLine(0, self.ed.SC_LINECHARACTERINDEX_UTF32), 0)
+ self.assertEquals(self.ed.IndexPositionFromLine(1, self.ed.SC_LINECHARACTERINDEX_UTF32), 7)
+ self.ed.ReleaseLineCharacterIndex(self.ed.SC_LINECHARACTERINDEX_UTF32)
+ self.assertEquals(self.ed.GetLineCharacterIndex(), self.ed.SC_LINECHARACTERINDEX_NONE)
+
+ def testUTF16(self):
+ self.assertEquals(self.ed.GetLineCharacterIndex(), self.ed.SC_LINECHARACTERINDEX_NONE)
+ t = "aå\U00010348flﬔ-"
+ tv = t.encode("UTF-8")
+ self.ed.SetContents(self.tv)
+ self.ed.AllocateLineCharacterIndex(self.ed.SC_LINECHARACTERINDEX_UTF16)
+ self.assertEquals(self.ed.IndexPositionFromLine(0, self.ed.SC_LINECHARACTERINDEX_UTF16), 0)
+ self.assertEquals(self.ed.IndexPositionFromLine(1, self.ed.SC_LINECHARACTERINDEX_UTF16), 8)
+ self.ed.ReleaseLineCharacterIndex(self.ed.SC_LINECHARACTERINDEX_UTF16)
+ self.assertEquals(self.ed.GetLineCharacterIndex(), self.ed.SC_LINECHARACTERINDEX_NONE)
+
+ def testBoth(self):
+ # Set text before turning indices on
+ self.assertEquals(self.ed.GetLineCharacterIndex(), self.ed.SC_LINECHARACTERINDEX_NONE)
+ self.ed.SetContents(self.tv)
+ self.ed.AllocateLineCharacterIndex(self.ed.SC_LINECHARACTERINDEX_UTF32+self.ed.SC_LINECHARACTERINDEX_UTF16)
+ self.assertEquals(self.ed.IndexPositionFromLine(0, self.ed.SC_LINECHARACTERINDEX_UTF32), 0)
+ self.assertEquals(self.ed.IndexPositionFromLine(1, self.ed.SC_LINECHARACTERINDEX_UTF32), 7)
+ self.assertEquals(self.ed.IndexPositionFromLine(0, self.ed.SC_LINECHARACTERINDEX_UTF16), 0)
+ self.assertEquals(self.ed.IndexPositionFromLine(1, self.ed.SC_LINECHARACTERINDEX_UTF16), 8)
+ # Test the inverse: position->line
+ self.assertEquals(self.ed.LineFromIndexPosition(0, self.ed.SC_LINECHARACTERINDEX_UTF32), 0)
+ self.assertEquals(self.ed.LineFromIndexPosition(7, self.ed.SC_LINECHARACTERINDEX_UTF32), 1)
+ self.assertEquals(self.ed.LineFromIndexPosition(0, self.ed.SC_LINECHARACTERINDEX_UTF16), 0)
+ self.assertEquals(self.ed.LineFromIndexPosition(8, self.ed.SC_LINECHARACTERINDEX_UTF16), 1)
+ self.ed.ReleaseLineCharacterIndex(self.ed.SC_LINECHARACTERINDEX_UTF32+self.ed.SC_LINECHARACTERINDEX_UTF16)
+ self.assertEquals(self.ed.GetLineCharacterIndex(), self.ed.SC_LINECHARACTERINDEX_NONE)
+
+ def testMaintenance(self):
+ # Set text after turning indices on
+ self.assertEquals(self.ed.GetLineCharacterIndex(), self.ed.SC_LINECHARACTERINDEX_NONE)
+ self.ed.AllocateLineCharacterIndex(self.ed.SC_LINECHARACTERINDEX_UTF32+self.ed.SC_LINECHARACTERINDEX_UTF16)
+ self.ed.SetContents(self.tv)
+ self.assertEquals(self.ed.IndexPositionFromLine(0, self.ed.SC_LINECHARACTERINDEX_UTF32), 0)
+ self.assertEquals(self.ed.IndexPositionFromLine(1, self.ed.SC_LINECHARACTERINDEX_UTF32), 7)
+ self.assertEquals(self.ed.IndexPositionFromLine(0, self.ed.SC_LINECHARACTERINDEX_UTF16), 0)
+ self.assertEquals(self.ed.IndexPositionFromLine(1, self.ed.SC_LINECHARACTERINDEX_UTF16), 8)
+ self.ed.ReleaseLineCharacterIndex(self.ed.SC_LINECHARACTERINDEX_UTF32+self.ed.SC_LINECHARACTERINDEX_UTF16)
+ self.assertEquals(self.ed.GetLineCharacterIndex(), self.ed.SC_LINECHARACTERINDEX_NONE)
+
class TestCharacterNavigation(unittest.TestCase):
def setUp(self):
self.xite = Xite.xiteFrame
@@ -1677,6 +1747,31 @@ class TestCharacterNavigation(unittest.TestCase):
self.assert_(after < previous)
previous = after
+ def testRelativeNonBOM(self):
+ # \x61 \xF0\x90\x8D\x88 \xef\xac\x82 \xef\xac\x94 \x2d
+ t = "a\U00010348flﬔ-"
+ tv = t.encode("UTF-8")
+ self.ed.SetContents(tv)
+ self.assertEquals(self.ed.PositionRelative(1, 2), 8)
+ self.assertEquals(self.ed.CountCharacters(1, 8), 2)
+ self.assertEquals(self.ed.CountCodeUnits(1, 8), 3)
+ self.assertEquals(self.ed.PositionRelative(8, -2), 1)
+ self.assertEquals(self.ed.PositionRelativeCodeUnits(8, -3), 1)
+ pos = 0
+ previous = 0
+ for i in range(1, len(t)):
+ after = self.ed.PositionRelative(pos, i)
+ self.assert_(after > pos)
+ self.assert_(after > previous)
+ previous = after
+ pos = len(t)
+ previous = pos
+ for i in range(1, len(t)-1):
+ after = self.ed.PositionRelative(pos, -i)
+ self.assert_(after < pos)
+ self.assert_(after <= previous)
+ previous = after
+
def testLineEnd(self):
t = "a\r\nb\nc"
tv = t.encode("UTF-8")
diff --git a/test/unit/testCellBuffer.cxx b/test/unit/testCellBuffer.cxx
index 067fa4bc1..e6e486e58 100644
--- a/test/unit/testCellBuffer.cxx
+++ b/test/unit/testCellBuffer.cxx
@@ -10,6 +10,7 @@
#include "Platform.h"
+#include "Scintilla.h"
#include "Position.h"
#include "SplitVector.h"
#include "Partitioning.h"
@@ -145,3 +146,290 @@ TEST_CASE("CellBuffer") {
}
}
+
+TEST_CASE("CharacterIndex") {
+
+ CellBuffer cb(true, false);
+
+ // A fresh buffer reports no index and IndexLineStart() yields 0 for the
+ // lines queried. Allocating the UTF-16 index on the still-empty buffer only
+ // changes the reported index state; releasing it returns the state to NONE.
+ SECTION("Setup") {
+ REQUIRE(cb.LineCharacterIndex() == SC_LINECHARACTERINDEX_NONE);
+ REQUIRE(cb.IndexLineStart(0, SC_LINECHARACTERINDEX_UTF16) == 0);
+ REQUIRE(cb.IndexLineStart(1, SC_LINECHARACTERINDEX_UTF16) == 0);
+ cb.SetUTF8Substance(true);
+
+ cb.AllocateLineCharacterIndex(SC_LINECHARACTERINDEX_UTF16);
+ REQUIRE(cb.LineCharacterIndex() == SC_LINECHARACTERINDEX_UTF16);
+
+ REQUIRE(cb.IndexLineStart(0, SC_LINECHARACTERINDEX_UTF16) == 0);
+ REQUIRE(cb.IndexLineStart(1, SC_LINECHARACTERINDEX_UTF16) == 0);
+
+ cb.ReleaseLineCharacterIndex(SC_LINECHARACTERINDEX_UTF16);
+ REQUIRE(cb.LineCharacterIndex() == SC_LINECHARACTERINDEX_NONE);
+ }
+
+ // With both indices allocated, inserting text must update them.
+ // The 4-byte sequence is expected to add 2 to the UTF-16 measure of the
+ // line but only 1 to the UTF-32 measure.
+ SECTION("Insertion") {
+ cb.SetUTF8Substance(true);
+
+ cb.AllocateLineCharacterIndex(SC_LINECHARACTERINDEX_UTF16 | SC_LINECHARACTERINDEX_UTF32);
+
+ bool startSequence = false;
+ cb.InsertString(0, "a", 1, startSequence);
+ REQUIRE(cb.IndexLineStart(0, SC_LINECHARACTERINDEX_UTF16) == 0);
+ REQUIRE(cb.IndexLineStart(1, SC_LINECHARACTERINDEX_UTF16) == 1);
+ REQUIRE(cb.IndexLineStart(0, SC_LINECHARACTERINDEX_UTF32) == 0);
+ REQUIRE(cb.IndexLineStart(1, SC_LINECHARACTERINDEX_UTF32) == 1);
+
+ // Prepend a 4-byte (non-BMP) character -> "\xF0\x90\x8D\x88" "a"
+ const char *hwair = "\xF0\x90\x8D\x88";
+ cb.InsertString(0, hwair, strlen(hwair), startSequence);
+ REQUIRE(cb.IndexLineStart(0, SC_LINECHARACTERINDEX_UTF16) == 0);
+ REQUIRE(cb.IndexLineStart(1, SC_LINECHARACTERINDEX_UTF16) == 3);
+ REQUIRE(cb.IndexLineStart(0, SC_LINECHARACTERINDEX_UTF32) == 0);
+ REQUIRE(cb.IndexLineStart(1, SC_LINECHARACTERINDEX_UTF32) == 2);
+ }
+
+ // Deleting text must shrink both indices: first a trailing ASCII byte is
+ // removed, then the whole 4-byte character.
+ SECTION("Deletion") {
+ cb.SetUTF8Substance(true);
+
+ cb.AllocateLineCharacterIndex(SC_LINECHARACTERINDEX_UTF16 | SC_LINECHARACTERINDEX_UTF32);
+
+ bool startSequence = false;
+ // 'a', one 4-byte character, 'z': 6 bytes in total
+ const char *hwair = "a\xF0\x90\x8D\x88z";
+ cb.InsertString(0, hwair, strlen(hwair), startSequence);
+
+ REQUIRE(cb.IndexLineStart(0, SC_LINECHARACTERINDEX_UTF16) == 0);
+ REQUIRE(cb.IndexLineStart(1, SC_LINECHARACTERINDEX_UTF16) == 4);
+ REQUIRE(cb.IndexLineStart(0, SC_LINECHARACTERINDEX_UTF32) == 0);
+ REQUIRE(cb.IndexLineStart(1, SC_LINECHARACTERINDEX_UTF32) == 3);
+
+ // Delete the final 'z' -> "a\xF0\x90\x8D\x88"
+ cb.DeleteChars(5, 1, startSequence);
+
+ REQUIRE(cb.IndexLineStart(0, SC_LINECHARACTERINDEX_UTF16) == 0);
+ REQUIRE(cb.IndexLineStart(1, SC_LINECHARACTERINDEX_UTF16) == 3);
+ REQUIRE(cb.IndexLineStart(0, SC_LINECHARACTERINDEX_UTF32) == 0);
+ REQUIRE(cb.IndexLineStart(1, SC_LINECHARACTERINDEX_UTF32) == 2);
+
+ // Delete the 4-byte character -> "a"
+ cb.DeleteChars(1, 4, startSequence);
+
+ REQUIRE(cb.IndexLineStart(0, SC_LINECHARACTERINDEX_UTF16) == 0);
+ REQUIRE(cb.IndexLineStart(1, SC_LINECHARACTERINDEX_UTF16) == 1);
+ REQUIRE(cb.IndexLineStart(0, SC_LINECHARACTERINDEX_UTF32) == 0);
+ REQUIRE(cb.IndexLineStart(1, SC_LINECHARACTERINDEX_UTF32) == 1);
+ }
+
+ // A chain of insertions into multi-line text. Each step depends on the
+ // buffer state left by the previous one, so the order of steps matters.
+ // Covers new lines at and before the end, a valid multi-byte character,
+ // a lone lead byte, a byte that splits an existing character, and a NEL.
+ SECTION("Insert Complex") {
+ cb.SetUTF8Substance(true);
+ // NOTE(review): 1 presumably enables Unicode line ends, which the NEL step
+ // at the end of this section relies on - confirm against the constant.
+ cb.SetLineEndTypes(1);
+ cb.AllocateLineCharacterIndex(SC_LINECHARACTERINDEX_UTF16 | SC_LINECHARACTERINDEX_UTF32);
+
+ bool startSequence = false;
+ // 3 lines of text containing 8 bytes
+ const char *data = "a\n\xF0\x90\x8D\x88\nz";
+ cb.InsertString(0, data, strlen(data), startSequence);
+
+ REQUIRE(cb.IndexLineStart(0, SC_LINECHARACTERINDEX_UTF16) == 0);
+ REQUIRE(cb.IndexLineStart(1, SC_LINECHARACTERINDEX_UTF16) == 2);
+ REQUIRE(cb.IndexLineStart(2, SC_LINECHARACTERINDEX_UTF16) == 5);
+ REQUIRE(cb.IndexLineStart(3, SC_LINECHARACTERINDEX_UTF16) == 6);
+
+ REQUIRE(cb.IndexLineStart(0, SC_LINECHARACTERINDEX_UTF32) == 0);
+ REQUIRE(cb.IndexLineStart(1, SC_LINECHARACTERINDEX_UTF32) == 2);
+ REQUIRE(cb.IndexLineStart(2, SC_LINECHARACTERINDEX_UTF32) == 4);
+ REQUIRE(cb.IndexLineStart(3, SC_LINECHARACTERINDEX_UTF32) == 5);
+
+ // Insert a new line at end -> "a\n\xF0\x90\x8D\x88\nz\n" 4 lines
+ // Last line empty
+ cb.InsertString(strlen(data), "\n", 1, startSequence);
+
+ REQUIRE(cb.IndexLineStart(0, SC_LINECHARACTERINDEX_UTF16) == 0);
+ REQUIRE(cb.IndexLineStart(1, SC_LINECHARACTERINDEX_UTF16) == 2);
+ REQUIRE(cb.IndexLineStart(2, SC_LINECHARACTERINDEX_UTF16) == 5);
+ REQUIRE(cb.IndexLineStart(3, SC_LINECHARACTERINDEX_UTF16) == 7);
+ REQUIRE(cb.IndexLineStart(4, SC_LINECHARACTERINDEX_UTF16) == 7);
+
+ REQUIRE(cb.IndexLineStart(0, SC_LINECHARACTERINDEX_UTF32) == 0);
+ REQUIRE(cb.IndexLineStart(1, SC_LINECHARACTERINDEX_UTF32) == 2);
+ REQUIRE(cb.IndexLineStart(2, SC_LINECHARACTERINDEX_UTF32) == 4);
+ REQUIRE(cb.IndexLineStart(3, SC_LINECHARACTERINDEX_UTF32) == 6);
+ REQUIRE(cb.IndexLineStart(4, SC_LINECHARACTERINDEX_UTF32) == 6);
+
+ // Insert a new line before end -> "a\n\xF0\x90\x8D\x88\nz\n\n" 5 lines
+ cb.InsertString(strlen(data), "\n", 1, startSequence);
+
+ REQUIRE(cb.IndexLineStart(0, SC_LINECHARACTERINDEX_UTF16) == 0);
+ REQUIRE(cb.IndexLineStart(1, SC_LINECHARACTERINDEX_UTF16) == 2);
+ REQUIRE(cb.IndexLineStart(2, SC_LINECHARACTERINDEX_UTF16) == 5);
+ REQUIRE(cb.IndexLineStart(3, SC_LINECHARACTERINDEX_UTF16) == 7);
+ REQUIRE(cb.IndexLineStart(4, SC_LINECHARACTERINDEX_UTF16) == 8);
+ REQUIRE(cb.IndexLineStart(5, SC_LINECHARACTERINDEX_UTF16) == 8);
+
+ REQUIRE(cb.IndexLineStart(0, SC_LINECHARACTERINDEX_UTF32) == 0);
+ REQUIRE(cb.IndexLineStart(1, SC_LINECHARACTERINDEX_UTF32) == 2);
+ REQUIRE(cb.IndexLineStart(2, SC_LINECHARACTERINDEX_UTF32) == 4);
+ REQUIRE(cb.IndexLineStart(3, SC_LINECHARACTERINDEX_UTF32) == 6);
+ REQUIRE(cb.IndexLineStart(4, SC_LINECHARACTERINDEX_UTF32) == 7);
+ REQUIRE(cb.IndexLineStart(5, SC_LINECHARACTERINDEX_UTF32) == 7);
+
+ // Insert a valid 3-byte UTF-8 character at start ->
+ // "\xE2\x82\xACa\n\xF0\x90\x8D\x88\nz\n\n" 5 lines
+
+ const char *euro = "\xE2\x82\xAC";
+ cb.InsertString(0, euro, strlen(euro), startSequence);
+
+ REQUIRE(cb.IndexLineStart(0, SC_LINECHARACTERINDEX_UTF16) == 0);
+ REQUIRE(cb.IndexLineStart(1, SC_LINECHARACTERINDEX_UTF16) == 3);
+ REQUIRE(cb.IndexLineStart(2, SC_LINECHARACTERINDEX_UTF16) == 6);
+ REQUIRE(cb.IndexLineStart(3, SC_LINECHARACTERINDEX_UTF16) == 8);
+ REQUIRE(cb.IndexLineStart(4, SC_LINECHARACTERINDEX_UTF16) == 9);
+ REQUIRE(cb.IndexLineStart(5, SC_LINECHARACTERINDEX_UTF16) == 9);
+
+ REQUIRE(cb.IndexLineStart(0, SC_LINECHARACTERINDEX_UTF32) == 0);
+ REQUIRE(cb.IndexLineStart(1, SC_LINECHARACTERINDEX_UTF32) == 3);
+ REQUIRE(cb.IndexLineStart(2, SC_LINECHARACTERINDEX_UTF32) == 5);
+ REQUIRE(cb.IndexLineStart(3, SC_LINECHARACTERINDEX_UTF32) == 7);
+ REQUIRE(cb.IndexLineStart(4, SC_LINECHARACTERINDEX_UTF32) == 8);
+ REQUIRE(cb.IndexLineStart(5, SC_LINECHARACTERINDEX_UTF32) == 8);
+
+ // Insert a lone lead byte implying a 3 byte character at start of line 2 ->
+ // "\xE2\x82\xACa\n\xEF\xF0\x90\x8D\x88\nz\n\n" 5 lines
+ // Should be treated as a single byte character
+
+ const char *lead = "\xEF";
+ cb.InsertString(5, lead, strlen(lead), startSequence);
+
+ REQUIRE(cb.IndexLineStart(0, SC_LINECHARACTERINDEX_UTF16) == 0);
+ REQUIRE(cb.IndexLineStart(1, SC_LINECHARACTERINDEX_UTF16) == 3);
+ REQUIRE(cb.IndexLineStart(2, SC_LINECHARACTERINDEX_UTF16) == 7);
+ REQUIRE(cb.IndexLineStart(3, SC_LINECHARACTERINDEX_UTF16) == 9);
+
+ REQUIRE(cb.IndexLineStart(0, SC_LINECHARACTERINDEX_UTF32) == 0);
+ REQUIRE(cb.IndexLineStart(1, SC_LINECHARACTERINDEX_UTF32) == 3);
+ REQUIRE(cb.IndexLineStart(2, SC_LINECHARACTERINDEX_UTF32) == 6);
+ REQUIRE(cb.IndexLineStart(3, SC_LINECHARACTERINDEX_UTF32) == 8);
+
+ // Insert an ASCII byte inside the 3-byte initial character ->
+ // "\xE2!\x82\xACa\n\xEF\xF0\x90\x8D\x88\nz\n\n" 5 lines
+ // It should be treated as a single character and should cause the
+ // byte before and the 2 bytes after to each be treated as singles as well,
+ // so 3 more characters on line 0.
+
+ const char *ascii = "!";
+ cb.InsertString(1, ascii, strlen(ascii), startSequence);
+
+ REQUIRE(cb.IndexLineStart(0, SC_LINECHARACTERINDEX_UTF16) == 0);
+ REQUIRE(cb.IndexLineStart(1, SC_LINECHARACTERINDEX_UTF16) == 6);
+ REQUIRE(cb.IndexLineStart(2, SC_LINECHARACTERINDEX_UTF16) == 10);
+
+ REQUIRE(cb.IndexLineStart(0, SC_LINECHARACTERINDEX_UTF32) == 0);
+ REQUIRE(cb.IndexLineStart(1, SC_LINECHARACTERINDEX_UTF32) == 6);
+ REQUIRE(cb.IndexLineStart(2, SC_LINECHARACTERINDEX_UTF32) == 9);
+
+ // Insert a NEL after the '!' to trigger the utf8 line end case ->
+ // "\xE2!\xC2\x85 \x82\xACa\n \xEF\xF0\x90\x8D\x88\n z\n\n" 6 lines
+ // (spaces in the string above only mark the line boundaries)
+
+ const char *nel = "\xC2\x85";
+ cb.InsertString(2, nel, strlen(nel), startSequence);
+
+ REQUIRE(cb.IndexLineStart(0, SC_LINECHARACTERINDEX_UTF16) == 0);
+ REQUIRE(cb.IndexLineStart(1, SC_LINECHARACTERINDEX_UTF16) == 3);
+ REQUIRE(cb.IndexLineStart(2, SC_LINECHARACTERINDEX_UTF16) == 7);
+ REQUIRE(cb.IndexLineStart(3, SC_LINECHARACTERINDEX_UTF16) == 11);
+
+ REQUIRE(cb.IndexLineStart(0, SC_LINECHARACTERINDEX_UTF32) == 0);
+ REQUIRE(cb.IndexLineStart(1, SC_LINECHARACTERINDEX_UTF32) == 3);
+ REQUIRE(cb.IndexLineStart(2, SC_LINECHARACTERINDEX_UTF32) == 7);
+ REQUIRE(cb.IndexLineStart(3, SC_LINECHARACTERINDEX_UTF32) == 10);
+ }
+
+ // A single deletion that removes two line ends, and the 4-byte character
+ // between them, must merge the affected lines in both indices.
+ SECTION("Delete Multiple lines") {
+ cb.SetUTF8Substance(true);
+ cb.AllocateLineCharacterIndex(SC_LINECHARACTERINDEX_UTF16 | SC_LINECHARACTERINDEX_UTF32);
+
+ bool startSequence = false;
+ // 4 lines of text containing 10 bytes
+ const char *data = "a\n\xF0\x90\x8D\x88\nz\nc";
+ cb.InsertString(0, data, strlen(data), startSequence);
+
+ // Delete first 2 new lines -> "az\nc"
+ cb.DeleteChars(1, strlen(data) - 4, startSequence);
+
+ REQUIRE(cb.IndexLineStart(0, SC_LINECHARACTERINDEX_UTF16) == 0);
+ REQUIRE(cb.IndexLineStart(1, SC_LINECHARACTERINDEX_UTF16) == 3);
+ REQUIRE(cb.IndexLineStart(2, SC_LINECHARACTERINDEX_UTF16) == 4);
+
+ REQUIRE(cb.IndexLineStart(0, SC_LINECHARACTERINDEX_UTF32) == 0);
+ REQUIRE(cb.IndexLineStart(1, SC_LINECHARACTERINDEX_UTF32) == 3);
+ REQUIRE(cb.IndexLineStart(2, SC_LINECHARACTERINDEX_UTF32) == 4);
+ }
+
+ // Deletions that break a multi-byte character apart, followed by an
+ // insertion that makes it whole again. Each step depends on the buffer
+ // state left by the previous one.
+ SECTION("Delete Complex") {
+ cb.SetUTF8Substance(true);
+ cb.AllocateLineCharacterIndex(SC_LINECHARACTERINDEX_UTF16 | SC_LINECHARACTERINDEX_UTF32);
+
+ bool startSequence = false;
+ // 3 lines of text containing 8 bytes
+ const char *data = "a\n\xF0\x90\x8D\x88\nz";
+ cb.InsertString(0, data, strlen(data), startSequence);
+
+ // Delete lead byte from character on line 1 ->
+ // "a\n\x90\x8D\x88\nz"
+ // line 1 becomes 4 single byte characters
+ cb.DeleteChars(2, 1, startSequence);
+
+ REQUIRE(cb.IndexLineStart(0, SC_LINECHARACTERINDEX_UTF16) == 0);
+ REQUIRE(cb.IndexLineStart(1, SC_LINECHARACTERINDEX_UTF16) == 2);
+ REQUIRE(cb.IndexLineStart(2, SC_LINECHARACTERINDEX_UTF16) == 6);
+ REQUIRE(cb.IndexLineStart(3, SC_LINECHARACTERINDEX_UTF16) == 7);
+
+ REQUIRE(cb.IndexLineStart(0, SC_LINECHARACTERINDEX_UTF32) == 0);
+ REQUIRE(cb.IndexLineStart(1, SC_LINECHARACTERINDEX_UTF32) == 2);
+ REQUIRE(cb.IndexLineStart(2, SC_LINECHARACTERINDEX_UTF32) == 6);
+ REQUIRE(cb.IndexLineStart(3, SC_LINECHARACTERINDEX_UTF32) == 7);
+
+ // Delete first new line ->
+ // "a\x90\x8D\x88\nz"
+ // Only 2 lines with line 0 containing 5 single byte characters
+ cb.DeleteChars(1, 1, startSequence);
+
+ REQUIRE(cb.IndexLineStart(0, SC_LINECHARACTERINDEX_UTF16) == 0);
+ REQUIRE(cb.IndexLineStart(1, SC_LINECHARACTERINDEX_UTF16) == 5);
+ REQUIRE(cb.IndexLineStart(2, SC_LINECHARACTERINDEX_UTF16) == 6);
+
+ REQUIRE(cb.IndexLineStart(0, SC_LINECHARACTERINDEX_UTF32) == 0);
+ REQUIRE(cb.IndexLineStart(1, SC_LINECHARACTERINDEX_UTF32) == 5);
+ REQUIRE(cb.IndexLineStart(2, SC_LINECHARACTERINDEX_UTF32) == 6);
+
+ // Restore lead byte from character on line 0 making a 4-byte character ->
+ // "a\xF0\x90\x8D\x88\nz"
+
+ const char *lead4 = "\xF0";
+ cb.InsertString(1, lead4, strlen(lead4), startSequence);
+
+ REQUIRE(cb.IndexLineStart(0, SC_LINECHARACTERINDEX_UTF16) == 0);
+ REQUIRE(cb.IndexLineStart(1, SC_LINECHARACTERINDEX_UTF16) == 4);
+ REQUIRE(cb.IndexLineStart(2, SC_LINECHARACTERINDEX_UTF16) == 5);
+
+ REQUIRE(cb.IndexLineStart(0, SC_LINECHARACTERINDEX_UTF32) == 0);
+ REQUIRE(cb.IndexLineStart(1, SC_LINECHARACTERINDEX_UTF32) == 3);
+ REQUIRE(cb.IndexLineStart(2, SC_LINECHARACTERINDEX_UTF32) == 4);
+ }
+
+ // Inserting a byte between the '\r' and '\n' of one line end turns it into
+ // two line ends, so the index must gain a line. Only the UTF-16 index is
+ // checked here although both are allocated.
+ SECTION("Insert separates new line bytes") {
+ cb.SetUTF8Substance(true);
+ cb.AllocateLineCharacterIndex(SC_LINECHARACTERINDEX_UTF16 | SC_LINECHARACTERINDEX_UTF32);
+
+ bool startSequence = false;
+ // 2 lines of text containing 4 bytes
+ const char *data = "a\r\nb";
+ cb.InsertString(0, data, strlen(data), startSequence);
+
+ // 3 lines of text containing 5 bytes ->
+ // "a\r!\nb"
+ const char *ascii = "!";
+ cb.InsertString(2, ascii, strlen(ascii), startSequence);
+
+ REQUIRE(cb.IndexLineStart(0, SC_LINECHARACTERINDEX_UTF16) == 0);
+ REQUIRE(cb.IndexLineStart(1, SC_LINECHARACTERINDEX_UTF16) == 2);
+ REQUIRE(cb.IndexLineStart(2, SC_LINECHARACTERINDEX_UTF16) == 4);
+ REQUIRE(cb.IndexLineStart(3, SC_LINECHARACTERINDEX_UTF16) == 5);
+ }
+}
--
cgit v1.2.3