diff options
| -rw-r--r-- | doc/ScintillaDoc.html | 126 | ||||
| -rw-r--r-- | doc/ScintillaHistory.html | 5 | ||||
| -rw-r--r-- | include/Scintilla.h | 10 | ||||
| -rw-r--r-- | include/Scintilla.iface | 28 | ||||
| -rw-r--r-- | src/CellBuffer.cxx | 334 | ||||
| -rw-r--r-- | src/CellBuffer.h | 9 | ||||
| -rw-r--r-- | src/Document.cxx | 22 | ||||
| -rw-r--r-- | src/Document.h | 5 | ||||
| -rw-r--r-- | src/Editor.cxx | 26 | ||||
| -rw-r--r-- | src/UniConversion.cxx | 16 | ||||
| -rw-r--r-- | src/UniConversion.h | 4 | ||||
| -rw-r--r-- | test/simpleTests.py | 95 | ||||
| -rw-r--r-- | test/unit/testCellBuffer.cxx | 288 | 
13 files changed, 923 insertions, 45 deletions
| diff --git a/doc/ScintillaDoc.html b/doc/ScintillaDoc.html index 8a8eddbff..ce0758bc0 100644 --- a/doc/ScintillaDoc.html +++ b/doc/ScintillaDoc.html @@ -270,136 +270,140 @@          <tr>            <td>○ <a class="toc" href="#SelectionAndInformation">Selection and information</a></td> -          <td>○ <a class="toc" href="#MultipleSelectionAndVirtualSpace">Multiple Selection and Virtual Space</a></td> +          <td>○ <a class="toc" href="#ByCharacterOrCodeUnit">By character or UTF-16 code unit</a></td> -          <td>○ <a class="toc" href="#ScrollingAndAutomaticScrolling">Scrolling and automatic +          <td>○ <a class="toc" href="#MultipleSelectionAndVirtualSpace">Multiple Selection and Virtual Space</a></td>            scrolling</a></td>          </tr>          <tr> +          <td>○ <a class="toc" href="#ScrollingAndAutomaticScrolling">Scrolling and automatic +            <td>○ <a class="toc" href="#WhiteSpace">White space</a></td>            <td>○ <a class="toc" href="#Cursor">Cursor</a></td> -          <td>○ <a class="toc" href="#MouseCapture">Mouse capture</a></td> -          </tr>          <tr> +          <td>○ <a class="toc" href="#MouseCapture">Mouse capture</a></td> +            <td>○ <a class="toc" href="#LineEndings">Line endings</a></td>            <td>○ <a class="toc" href="#Words">Words</a></td> -          <td>○ <a class="toc" href="#Styling">Styling</a></td> -          </tr>          <tr> +          <td>○ <a class="toc" href="#Styling">Styling</a></td> +            <td>○ <a class="toc" href="#StyleDefinition">Style definition</a></td>            <td>○ <a class="toc" href="#CaretAndSelectionStyles">Caret, selection, and hotspot styles</a></td> -          <td>○ <a class="toc" href="#CharacterRepresentations">Character representations</a></td> -          </tr>          <tr> +          <td>○ <a class="toc" href="#CharacterRepresentations">Character representations</a></td> +            <td>○ <a class="toc" href="#Margins">Margins</a></td>            <td>○ <a class="toc" href="#Annotations">Annotations</a></td> -          <td>○ <a class="toc" href="#OtherSettings">Other settings</a></td> -          </tr>          <tr> +          <td>○ <a class="toc" href="#OtherSettings">Other settings</a></td> +            <td>○ <a class="toc" href="#BraceHighlighting">Brace highlighting</a></td>            <td>○ <a class="toc" href="#TabsAndIndentationGuides">Tabs and Indentation            Guides</a></td> -          <td>○ <a class="toc" href="#Markers">Markers</a></td> -          </tr>          <tr> +          <td>○ <a class="toc" href="#Markers">Markers</a></td> +            <td>○ <a class="toc" href="#Indicators">Indicators</a></td>            <td>○ <a class="toc" href="#Autocompletion">Autocompletion</a></td> -          <td>○ <a class="toc" href="#UserLists">User lists</a></td> -          </tr>          <tr> +          <td>○ <a class="toc" href="#UserLists">User lists</a></td> +            <td>○ <a class="toc" href="#CallTips">Call tips</a></td>            <td>○ <a class="toc" href="#KeyboardCommands">Keyboard commands</a></td> -          <td>○ <a class="toc" href="#KeyBindings">Key bindings</a></td> -          </tr>          <tr> +          <td>○ <a class="toc" href="#KeyBindings">Key bindings</a></td> +            <td>○ <a class="toc" href="#PopupEditMenu">Popup edit menu</a></td>            <td>○ <a class="toc" href="#MacroRecording">Macro recording</a></td> -          <td>○ <a class="toc" href="#Printing">Printing</a></td> -          </tr>          <tr> +          <td>○ <a class="toc" href="#Printing">Printing</a></td> +            <td>○ <a class="toc" href="#DirectAccess">Direct access</a></td>            <td>○ <a class="toc" href="#MultipleViews">Multiple views</a></td> -          <td>○ <a class="toc" href="#BackgroundLoadSave">Background loading and saving</a></td> -          </tr>          <tr> +          <td>○ <a class="toc" href="#BackgroundLoadSave">Background loading and saving</a></td> +            <td>○ <a class="toc" href="#Folding">Folding</a></td>            <td>○ <a class="toc" href="#LineWrapping">Line wrapping</a></td> -          <td>○ <a class="toc" href="#Zooming">Zooming</a></td> -          </tr>          <tr> +          <td>○ <a class="toc" href="#Zooming">Zooming</a></td> +            <td>○ <a class="toc" href="#LongLines">Long lines</a></td>            <td>○ <a class="toc" href="#Accessibility">Accessibility</a></td> -          <td>○ <a class="toc" href="#Lexer">Lexer</a></td> -          </tr>          <tr> +          <td>○ <a class="toc" href="#Lexer">Lexer</a></td> +            <td>○ <a class="toc" href="#LexerObjects">Lexer objects</a></td>            <td>○ <a class="toc" href="#Notifications">Notifications</a></td> -          <td>○ <a class="toc" href="#Images">Images</a></td> -          </tr>          <tr> +          <td>○ <a class="toc" href="#Images">Images</a></td> +            <td>○ <a class="toc" href="#GTK">GTK+</a></td>            <td>○ <a class="toc" href="#ProvisionalMessages"><span class="provisional">Provisional messages</span></a></td> -          <td>○ <a class="toc" href="#DeprecatedMessages">Deprecated messages</a></td> -          </tr>          <tr> +          <td>○ <a class="toc" href="#DeprecatedMessages">Deprecated messages</a></td> +            <td>○ <a class="toc" href="#EditMessagesNeverSupportedByScintilla">Edit messages never            supported by Scintilla</a></td>            <td>○ <a class="toc" href="#RemovedFeatures">Removed features</a></td> +        </tr> +        <tr>            <td>○ <a class="toc" href="#BuildingScintilla">Building Scintilla</a></td>          </tr> @@ -1229,8 +1233,6 @@ struct Sci_TextToFind {       <a class="message" href="#SCI_MOVECARETINSIDEVIEW">SCI_MOVECARETINSIDEVIEW</a><br />       <a class="message" href="#SCI_POSITIONBEFORE">SCI_POSITIONBEFORE(int pos) → position</a><br />       <a class="message" href="#SCI_POSITIONAFTER">SCI_POSITIONAFTER(int pos) → position</a><br /> -     <a class="message" href="#SCI_POSITIONRELATIVE">SCI_POSITIONRELATIVE(int pos, int relative) → position</a><br /> -     <a class="message" href="#SCI_COUNTCHARACTERS">SCI_COUNTCHARACTERS(int start, int end) → int</a><br />       <a class="message" href="#SCI_TEXTWIDTH">SCI_TEXTWIDTH(int style, const char *text) → int</a><br />       <a class="message" href="#SCI_TEXTHEIGHT">SCI_TEXTHEIGHT(int line) → int</a><br />       <a class="message" href="#SCI_CHOOSECARETX">SCI_CHOOSECARETX</a><br /> @@ -1448,15 +1450,6 @@ struct Sci_TextToFind {       If called with a position within a multi byte character will return the position       of the start/end of that character.</p> -    <p><b id="SCI_POSITIONRELATIVE">SCI_POSITIONRELATIVE(int pos, int relative) → position</b><br /> -     Count a number of whole characters before or after the argument position and return that position. -     The minimum position returned is 0 and the maximum is the last position in the document. -     If the position goes past the document end then 0 is returned. -     </p> - -    <p><b id="SCI_COUNTCHARACTERS">SCI_COUNTCHARACTERS(int start, int end) → int</b><br /> -     Returns the number of whole characters between two positions..</p> -      <p><b id="SCI_TEXTWIDTH">SCI_TEXTWIDTH(int style, const char *text) → int</b><br />       This returns the pixel width of a string drawn in the given <code class="parameter">style</code> which can      be used, for example, to decide how wide to make the line number margin in order to display a @@ -1527,6 +1520,61 @@ struct Sci_TextToFind {       When this option is turned off, mouse selections will always stick to the mode the selection was started in. It       is off by default.</p> +    <h2 id="ByCharacterOrCodeUnit">By character or UTF-16 code unit</h2> + +    <p>Most Scintilla APIs use byte positions but some applications want to use positions based on counting +    (UTF-32) characters or (UTF-16) code units +    or need to communicate with other code written in terms of characters or code units. +    With only byte positions, this may require examining many bytes to count characters or code units in the document +    but this may be sped up in some cases by indexing the line starts by character or code unit.</p> + +    <code> +     <a class="message" href="#SCI_POSITIONRELATIVE">SCI_POSITIONRELATIVE(int pos, int relative) → position</a><br /> +     <a class="message" href="#SCI_POSITIONRELATIVECODEUNITS">SCI_POSITIONRELATIVECODEUNITS(int pos, int relative) → position</a><br /> +     <a class="message" href="#SCI_COUNTCHARACTERS">SCI_COUNTCHARACTERS(int start, int end) → int</a><br /> +     <a class="message" href="#SCI_COUNTCODEUNITS">SCI_COUNTCODEUNITS(int start, int end) → int</a><br /> +     <a class="message" href="#SCI_GETLINECHARACTERINDEX">SCI_GETLINECHARACTERINDEX → int</a><br /> +     <a class="message" href="#SCI_ALLOCATELINECHARACTERINDEX">SCI_ALLOCATELINECHARACTERINDEX(int lineCharacterIndex)</a><br /> +     <a class="message" href="#SCI_RELEASELINECHARACTERINDEX">SCI_RELEASELINECHARACTERINDEX(int lineCharacterIndex)</a><br /> +     <a class="message" href="#SCI_LINEFROMINDEXPOSITION">SCI_LINEFROMINDEXPOSITION(int pos, int lineCharacterIndex) → int</a><br /> +     <a class="message" href="#SCI_INDEXPOSITIONFROMLINE">SCI_INDEXPOSITIONFROMLINE(int line, int lineCharacterIndex) → int</a><br /> +    </code> + +    <p><b id="SCI_POSITIONRELATIVE">SCI_POSITIONRELATIVE(int pos, int relative) → position</b><br /> +     Count a number of whole characters before or after the argument position and return that position. +     The minimum position returned is 0 and the maximum is the last position in the document. +     If the position goes past the document end then 0 is returned. +     </p> + +    <p><b id="SCI_COUNTCHARACTERS">SCI_COUNTCHARACTERS(int start, int end) → int</b><br /> +     Returns the number of whole characters between two positions.</p> + +    <p><b id="SCI_POSITIONRELATIVECODEUNITS">SCI_POSITIONRELATIVECODEUNITS(int pos, int relative) → int</b><br /> +    <b id="SCI_COUNTCODEUNITS">SCI_COUNTCODEUNITS(int start, int end) → int</b><br /> +     These are the UTF-16 versions of <code>SCI_POSITIONRELATIVE</code> and <code>SCI_COUNTCHARACTERS</code> +     working in terms of UTF-16 code units.</p> + +    <p><b id="SCI_GETLINECHARACTERINDEX">SCI_GETLINECHARACTERINDEX → int</b><br /> +     Returns which if any indexes are active. It may be <code>SC_LINECHARACTERINDEX_NONE(0)</code> or one or more +     of <code>SC_LINECHARACTERINDEX_UTF32(1)</code> if whole characters are indexed or +     <code>SC_LINECHARACTERINDEX_UTF16(2)</code> if UTF-16 code units are indexed. +     Character indexes are currently only supported for UTF-8 documents.</p> + +    <p><b id="SCI_ALLOCATELINECHARACTERINDEX">SCI_ALLOCATELINECHARACTERINDEX(int lineCharacterIndex)</b><br /> +    <b id="SCI_RELEASELINECHARACTERINDEX">SCI_RELEASELINECHARACTERINDEX(int lineCharacterIndex)</b><br /> +     Allocate or release one or more indexes using same enumeration as <code>SCI_GETLINECHARACTERINDEX</code>. +     Different aspects of an application may need indexes for different periods and should allocate for those periods. +     Indexes use additional memory so releasing them can help minimize memory but they also take time to recalculate. +     Scintilla may also allocate indexes to support features like accessibility or input method editors. +     Only one index of each type is created for a document at a time.</p> + +    <p><b id="SCI_LINEFROMINDEXPOSITION">SCI_LINEFROMINDEXPOSITION(int pos, int lineCharacterIndex) → int</b><br /> +    <b id="SCI_INDEXPOSITIONFROMLINE">SCI_INDEXPOSITIONFROMLINE(int line, int lineCharacterIndex) → int</b><br /> +     The document line of a particular character or code unit may be found by calling <code>SCI_LINEFROMINDEXPOSITION</code> with one of +     <code>SC_LINECHARACTERINDEX_UTF32(1)</code> or <code>SC_LINECHARACTERINDEX_UTF16(2)</code>. +     The inverse action, finds the starting position of a document line either in characters or code units from the document start by calling +     <code>SCI_INDEXPOSITIONFROMLINE</code> with the same <code class="parameter">lineCharacterIndex</code> argument.</p> +      <h2 id="MultipleSelectionAndVirtualSpace">Multiple Selection and Virtual Space</h2>      <code> diff --git a/doc/ScintillaHistory.html b/doc/ScintillaHistory.html index 70895e649..365bb0ed0 100644 --- a/doc/ScintillaHistory.html +++ b/doc/ScintillaHistory.html @@ -550,6 +550,11 @@  	Released 19 June 2018.  	</li>  	<li> +	Optional indexing of line starts in UTF-8 documents by UTF-32 code points and UTF-16 code units added. +	This can improve performance for clients that provide UTF-32 or UTF-16 interfaces or that need to interoperate +	with UTF-32 or UTF-16 components. +	</li> +	<li>  	Lexers added for SAS and Stata.  	<a href="https://sourceforge.net/p/scintilla/feature-requests/1185/">Feature #1185.</a>  	</li> diff --git a/include/Scintilla.h b/include/Scintilla.h index db4524f12..ccbeef99e 100644 --- a/include/Scintilla.h +++ b/include/Scintilla.h @@ -365,6 +365,7 @@ typedef sptr_t (*SciFnDirect)(sptr_t ptr, unsigned int iMessage, uptr_t wParam,  #define SCI_GETLINEINDENTPOSITION 2128  #define SCI_GETCOLUMN 2129  #define SCI_COUNTCHARACTERS 2633 +#define SCI_COUNTCODEUNITS 2715  #define SCI_SETHSCROLLBAR 2130  #define SCI_GETHSCROLLBAR 2131  #define SC_IV_NONE 0 @@ -753,6 +754,7 @@ typedef sptr_t (*SciFnDirect)(sptr_t ptr, unsigned int iMessage, uptr_t wParam,  #define SCI_POSITIONBEFORE 2417  #define SCI_POSITIONAFTER 2418  #define SCI_POSITIONRELATIVE 2670 +#define SCI_POSITIONRELATIVECODEUNITS 2716  #define SCI_COPYRANGE 2419  #define SCI_COPYTEXT 2420  #define SC_SEL_STREAM 0 @@ -1112,6 +1114,14 @@ typedef sptr_t (*SciFnDirect)(sptr_t ptr, unsigned int iMessage, uptr_t wParam,  #define SC_BIDIRECTIONAL_R2L 2  #define SCI_GETBIDIRECTIONAL 2708  #define SCI_SETBIDIRECTIONAL 2709 +#define SC_LINECHARACTERINDEX_NONE 0 +#define SC_LINECHARACTERINDEX_UTF32 1 +#define SC_LINECHARACTERINDEX_UTF16 2 +#define SCI_GETLINECHARACTERINDEX 2710 +#define SCI_ALLOCATELINECHARACTERINDEX 2711 +#define SCI_RELEASELINECHARACTERINDEX 2712 +#define SCI_LINEFROMINDEXPOSITION 2713 +#define SCI_INDEXPOSITIONFROMLINE 2714  #endif  /* --Autogenerated -- end of section automatically generated from Scintilla.iface */ diff --git a/include/Scintilla.iface b/include/Scintilla.iface index 420a529a0..3719628a2 100644 --- a/include/Scintilla.iface +++ b/include/Scintilla.iface @@ -862,6 +862,9 @@ get int GetColumn=2129(position pos,)  # Count characters between two positions.  fun int CountCharacters=2633(position start, position end) +# Count code units between two positions. +fun int CountCodeUnits=2715(position start, position end) +  # Show or hide the horizontal scroll bar.  set void SetHScrollBar=2130(bool visible,)  # Is the horizontal scroll bar visible? @@ -1959,6 +1962,11 @@ fun position PositionAfter=2418(position pos,)  # of characters. Returned value is always between 0 and last position in document.  fun position PositionRelative=2670(position pos, int relative) +# Given a valid document position, return a position that differs in a number +# of UTF-16 code units. Returned value is always between 0 and last position in document. +# The result may point half way (2 bytes) inside a non-BMP character. +fun position PositionRelativeCodeUnits=2716(position pos, int relative) +  # Copy a range of text to the clipboard. Positions are clipped into the document.  fun void CopyRange=2419(position start, position end) @@ -4937,6 +4945,26 @@ get int GetBidirectional=2708(,)  # Set bidirectional text display state.  set void SetBidirectional=2709(int bidirectional,) +enu LineCharacterIndexType=SC_LINECHARACTERINDEX_ +val SC_LINECHARACTERINDEX_NONE=0 +val SC_LINECHARACTERINDEX_UTF32=1 +val SC_LINECHARACTERINDEX_UTF16=2 + +# Retrieve line character index state. +get int GetLineCharacterIndex=2710(,) + +# Request line character index be created or its use count increased. +fun void AllocateLineCharacterIndex=2711(int lineCharacterIndex,) + +# Decrease use count of line character index and remove if 0. +fun void ReleaseLineCharacterIndex=2712(int lineCharacterIndex,) + +# Retrieve the document line containing a position measured in index units. +fun int LineFromIndexPosition=2713(position posUTF32, int lineCharacterIndex) + +# Retrieve the position measured in index units at the start of a document line. +fun position IndexPositionFromLine=2714(int line, int lineCharacterIndex) +  cat Deprecated  # Divide each styling byte into lexical class bits (default: 5) and indicator diff --git a/src/CellBuffer.cxx b/src/CellBuffer.cxx index e8c385f1f..ffe5fe8b3 100644 --- a/src/CellBuffer.cxx +++ b/src/CellBuffer.cxx @@ -7,6 +7,7 @@  #include <cstddef>  #include <cstdlib> +#include <cassert>  #include <cstring>  #include <cstdio>  #include <cstdarg> @@ -28,17 +29,53 @@  namespace Scintilla { +struct CountWidths { +	// Measures the number of characters in a string divided into those +	// from the Base Multilingual Plane and those from other planes. +	Sci::Position countBasePlane; +	Sci::Position countOtherPlanes; +	CountWidths(Sci::Position countBasePlane_=0, Sci::Position countOtherPlanes_=0) noexcept : +		countBasePlane(countBasePlane_), +		countOtherPlanes(countOtherPlanes_) { +	} +	CountWidths operator-() const noexcept { +		return CountWidths(-countBasePlane , -countOtherPlanes); +	} +	Sci::Position WidthUTF32() const noexcept { +		// All code points take one code unit in UTF-32. +		return countBasePlane + countOtherPlanes; +	} +	Sci::Position WidthUTF16() const noexcept { +		// UTF-16 takes 2 code units for other planes +		return countBasePlane + 2 * countOtherPlanes; +	} +	void CountChar(int lenChar) noexcept { +		if (lenChar == 4) { +			countOtherPlanes++; +		} else { +			countBasePlane++; +		} +	} +}; +  class ILineVector {  public:  	virtual void Init() = 0;  	virtual void SetPerLine(PerLine *pl) = 0;  	virtual void InsertText(Sci::Line line, Sci::Position delta) = 0;  	virtual void InsertLine(Sci::Line line, Sci::Position position, bool lineStart) = 0; -	virtual void SetLineStart(Sci::Line line, Sci::Position position) = 0; +	virtual void SetLineStart(Sci::Line line, Sci::Position position) noexcept = 0;  	virtual void RemoveLine(Sci::Line line) = 0;  	virtual Sci::Line Lines() const noexcept = 0;  	virtual Sci::Line LineFromPosition(Sci::Position pos) const noexcept = 0;  	virtual Sci::Position LineStart(Sci::Line line) const noexcept = 0; +	virtual void InsertCharacters(Sci::Line line, CountWidths delta) = 0; +	virtual void SetLineCharactersWidth(Sci::Line line, CountWidths width) = 0; +	virtual int LineCharacterIndex() const noexcept = 0; +	virtual bool AllocateLineCharacterIndex(int lineCharacterIndex, Sci::Line lines) = 0; +	virtual bool ReleaseLineCharacterIndex(int lineCharacterIndex) = 0; +	virtual Sci::Position IndexLineStart(Sci::Line line, int lineCharacterIndex) const noexcept = 0; +	virtual Sci::Line LineFromPositionIndex(Sci::Position pos, int lineCharacterIndex) const noexcept = 0;  	virtual ~ILineVector() {}  }; @@ -47,9 +84,58 @@ public:  using namespace Scintilla;  template <typename POS> +class LineStartIndex { +public: +	int refCount; +	Partitioning<POS> starts; + +	LineStartIndex() : refCount(0), starts(4) { +		// Minimal initial allocation +	} +	// Deleted so LineStartIndex objects can not be copied. +	LineStartIndex(const LineStartIndex &) = delete; +	LineStartIndex(LineStartIndex &&) = delete; +	void operator=(const LineStartIndex &) = delete; +	void operator=(LineStartIndex &&) = delete; +	virtual ~LineStartIndex() { +		starts.DeleteAll(); +	} +	bool Allocate(Sci::Line lines) { +		refCount++; +		Sci::Position length = starts.PositionFromPartition(starts.Partitions()); +		for (Sci::Line line = starts.Partitions(); line < lines; line++) { +			// Produce an ascending sequence that will be filled in with correct widths later +			length++; +			starts.InsertPartition(static_cast<POS>(line), static_cast<POS>(length)); +		} +		return refCount == 1; +	} +	bool Release() { +		if (refCount == 1) { +			starts.DeleteAll(); +		} +		refCount--; +		return refCount == 0; +	} +	bool Active() const noexcept { +		return refCount > 0; +	} +	Sci::Position LineWidth(Sci::Line line) const noexcept { +		return starts.PositionFromPartition(static_cast<POS>(line) + 1) - +			starts.PositionFromPartition(static_cast<POS>(line)); +	} +	void SetLineWidth(Sci::Line line, Sci::Position width) { +		const Sci::Position widthCurrent = LineWidth(line); +		starts.InsertText(static_cast<POS>(line), static_cast<POS>(width - widthCurrent)); +	} +}; + +template <typename POS>  class LineVector : public ILineVector {  	Partitioning<POS> starts;  	PerLine *perLine; +	LineStartIndex<POS> startsUTF16; +	LineStartIndex<POS> startsUTF32;  public:  	LineVector() : starts(256), perLine(0) {  		Init(); @@ -66,7 +152,9 @@ public:  		if (perLine) {  			perLine->Init();  		} - 	} +		startsUTF32.starts.DeleteAll(); +		startsUTF16.starts.DeleteAll(); +	}  	void SetPerLine(PerLine *pl) override {  		perLine = pl;  	} @@ -74,18 +162,33 @@ public:  		starts.InsertText(static_cast<POS>(line), static_cast<POS>(delta));  	}  	void InsertLine(Sci::Line line, Sci::Position position, bool lineStart) override { -		starts.InsertPartition(static_cast<POS>(line), static_cast<POS>(position)); +		const POS lineAsPos = static_cast<POS>(line); +		starts.InsertPartition(lineAsPos, static_cast<POS>(position)); +		if (startsUTF32.Active()) { +			startsUTF32.starts.InsertPartition(lineAsPos, +				static_cast<POS>(startsUTF32.starts.PositionFromPartition(lineAsPos - 1) + 1)); +		} +		if (startsUTF16.Active()) { +			startsUTF16.starts.InsertPartition(lineAsPos, +				static_cast<POS>(startsUTF16.starts.PositionFromPartition(lineAsPos - 1) + 1)); +		}  		if (perLine) {  			if ((line > 0) && lineStart)  				line--;  			perLine->InsertLine(line);  		}  	} -	void SetLineStart(Sci::Line line, Sci::Position position) override { +	void SetLineStart(Sci::Line line, Sci::Position position) noexcept override {  		starts.SetPartitionStartPosition(static_cast<POS>(line), static_cast<POS>(position));  	}  	void RemoveLine(Sci::Line line) override {  		starts.RemovePartition(static_cast<POS>(line)); +		if (startsUTF32.Active()) { +			startsUTF32.starts.RemovePartition(static_cast<POS>(line)); +		} +		if (startsUTF16.Active()) { +			startsUTF16.starts.RemovePartition(static_cast<POS>(line)); +		}  		if (perLine) {  			perLine->RemoveLine(line);  		} @@ -99,6 +202,71 @@ public:  	Sci::Position LineStart(Sci::Line line) const noexcept override {  		return starts.PositionFromPartition(static_cast<POS>(line));  	} +	void InsertCharacters(Sci::Line line, CountWidths delta) override { +		if (startsUTF32.Active()) { +			startsUTF32.starts.InsertText(static_cast<POS>(line), static_cast<POS>(delta.WidthUTF32())); +		} +		if (startsUTF16.Active()) { +			startsUTF16.starts.InsertText(static_cast<POS>(line), static_cast<POS>(delta.WidthUTF16())); +		} +	} +	void SetLineCharactersWidth(Sci::Line line, CountWidths width) override { +		if (startsUTF32.Active()) { +			assert(startsUTF32.starts.Partitions() == starts.Partitions()); +			startsUTF32.SetLineWidth(line, width.WidthUTF32()); +		} +		if (startsUTF16.Active()) { +			assert(startsUTF16.starts.Partitions() == starts.Partitions()); +			startsUTF16.SetLineWidth(line, width.WidthUTF16()); +		} +	} + +	int LineCharacterIndex() const noexcept override { +		int retVal = 0; +		if (startsUTF32.Active()) { +			retVal |= SC_LINECHARACTERINDEX_UTF32; +		} +		if (startsUTF16.Active()) { +			retVal |= SC_LINECHARACTERINDEX_UTF16; +		} +		return retVal; +	} +	bool AllocateLineCharacterIndex(int lineCharacterIndex, Sci::Line lines) override { +		bool changed = false; +		if ((lineCharacterIndex & SC_LINECHARACTERINDEX_UTF32) != 0) { +			changed = startsUTF32.Allocate(lines) || changed; +			assert(startsUTF32.starts.Partitions() == starts.Partitions()); +		} +		if ((lineCharacterIndex & SC_LINECHARACTERINDEX_UTF16) != 0) { +			changed = startsUTF16.Allocate(lines) || changed; +			assert(startsUTF16.starts.Partitions() == starts.Partitions()); +		} +		return changed; +	} +	bool ReleaseLineCharacterIndex(int lineCharacterIndex) override { +		bool changed = false; +		if ((lineCharacterIndex & SC_LINECHARACTERINDEX_UTF32) != 0) { +			changed = startsUTF32.Release() || changed; +		} +		if ((lineCharacterIndex & SC_LINECHARACTERINDEX_UTF16) != 0) { +			changed = startsUTF16.Release() || changed; +		} +		return changed; +	} +	Sci::Position IndexLineStart(Sci::Line line, int lineCharacterIndex) const noexcept override { +		if (lineCharacterIndex == SC_LINECHARACTERINDEX_UTF32) { +			return startsUTF32.starts.PositionFromPartition(static_cast<POS>(line)); +		} else { +			return startsUTF16.starts.PositionFromPartition(static_cast<POS>(line)); +		} +	} +	Sci::Line LineFromPositionIndex(Sci::Position pos, int lineCharacterIndex) const noexcept override { +		if (lineCharacterIndex == SC_LINECHARACTERINDEX_UTF32) { +			return static_cast<Sci::Line>(startsUTF32.starts.PartitionFromPosition(static_cast<POS>(pos))); +		} else { +			return static_cast<Sci::Line>(startsUTF16.starts.PartitionFromPosition(static_cast<POS>(pos))); +		} +	}  };  Action::Action() { @@ -364,6 +532,7 @@ void UndoHistory::CompletedRedoStep() {  CellBuffer::CellBuffer(bool hasStyles_, bool largeDocument_) :  	hasStyles(hasStyles_), largeDocument(largeDocument_) {  	readOnly = false; +	utf8Substance = false;  	utf8LineEnds = 0;  	collectingUndo = true;  	if (largeDocument) @@ -505,10 +674,19 @@ void CellBuffer::Allocate(Sci::Position newSize) {  	}  } +void CellBuffer::SetUTF8Substance(bool utf8Substance_) { +	if (utf8Substance != utf8Substance_) { +		utf8Substance = utf8Substance_; +		ResetLineEnds(); +	} +} +  void CellBuffer::SetLineEndTypes(int utf8LineEnds_) {  	if (utf8LineEnds != utf8LineEnds_) { +		const int indexes = plv->LineCharacterIndex();  		utf8LineEnds = utf8LineEnds_;  		ResetLineEnds(); +		AllocateLineCharacterIndex(indexes);  	}  } @@ -535,6 +713,23 @@ void CellBuffer::SetPerLine(PerLine *pl) {  	plv->SetPerLine(pl);  } +int CellBuffer::LineCharacterIndex() const noexcept { +	return plv->LineCharacterIndex(); +} + +void CellBuffer::AllocateLineCharacterIndex(int lineCharacterIndex) { +	if (utf8Substance) { +		if (plv->AllocateLineCharacterIndex(lineCharacterIndex, Lines())) { +			// Changed so recalculate whole file +			RecalculateIndexLineStarts(0, Lines() - 1); +		} +	} +} + +void CellBuffer::ReleaseLineCharacterIndex(int lineCharacterIndex) { +	plv->ReleaseLineCharacterIndex(lineCharacterIndex); +} +  Sci::Line CellBuffer::Lines() const noexcept {  	return plv->Lines();  } @@ -552,6 +747,14 @@ Sci::Line CellBuffer::LineFromPosition(Sci::Position pos) const noexcept {  	return plv->LineFromPosition(pos);  } +Sci::Position CellBuffer::IndexLineStart(Sci::Line line, int lineCharacterIndex) const noexcept { +	return plv->IndexLineStart(line, lineCharacterIndex); +} + +Sci::Line CellBuffer::LineFromPositionIndex(Sci::Position pos, int lineCharacterIndex) const noexcept { +	return plv->LineFromPositionIndex(pos, lineCharacterIndex); +} +  bool CellBuffer::IsReadOnly() const {  	return readOnly;  } @@ -612,6 +815,37 @@ bool CellBuffer::UTF8LineEndOverlaps(Sci::Position position) const {  	return UTF8IsSeparator(bytes) || UTF8IsSeparator(bytes+1) || UTF8IsNEL(bytes+1);  } +bool CellBuffer::UTF8IsCharacterBoundary(Sci::Position position) const { +	assert(position >= 0 && position <= Length()); +	if (position > 0) { +		std::string back; +		for (int i = 0; i < UTF8MaxBytes; i++) { +			const Sci::Position posBack = position - i; +			if (posBack < 0) { +				return false; +			} +			back.insert(0, 1, substance.ValueAt(posBack)); +			if (!UTF8IsTrailByte(back.front())) { +				if (i > 0) { +					// Have reached a non-trail +					const int cla = UTF8Classify(back); +					if ((cla & UTF8MaskInvalid) || (cla != i)) { +						return false; +					} +				} +				break; +			} +		} +	} +	if (position < Length()) { +		const unsigned char fore = substance.ValueAt(position); +		if (UTF8IsTrailByte(fore)) { +			return false; +		} +	} +	return true; +} +  void CellBuffer::ResetLineEnds() {  	// Reinitialize line data -- too much work to preserve  	plv->Init(); @@ -648,6 +882,38 @@ void CellBuffer::ResetLineEnds() {  	}  } +namespace { + +CountWidths CountCharacterWidthsUTF8(std::string_view sv) noexcept { +	CountWidths cw; +	size_t remaining = sv.length(); +	while (remaining > 0) { +		const int utf8Status = UTF8Classify(sv); +		const int lenChar = utf8Status & UTF8MaskWidth; +		cw.CountChar(lenChar); +		sv.remove_prefix(lenChar); +		remaining -= lenChar; +	} +	return cw; +} + +} + +void CellBuffer::RecalculateIndexLineStarts(Sci::Line lineFirst, Sci::Line lineLast) { +	std::string text; +	Sci::Position posLineEnd = LineStart(lineFirst); +	for (Sci::Line line = lineFirst; line <= lineLast; line++) { +		// Find line start and end, retrieve text of line, count characters and update line width +		const Sci::Position posLineStart = posLineEnd; +		posLineEnd = LineStart(line+1); +		const Sci::Position width = posLineEnd - posLineStart; +		text.resize(width); +		GetCharRange(text.data(), posLineStart, width); +		const CountWidths cw = CountCharacterWidthsUTF8(text); +		plv->SetLineCharactersWidth(line, cw); +	} +} +  void CellBuffer::BasicInsertString(Sci::Position position, const char *s, Sci::Position insertLength) {  	if (insertLength == 0)  		return; @@ -659,12 +925,25 @@ void CellBuffer::BasicInsertString(Sci::Position position, const char *s, Sci::P  		breakingUTF8LineEnd = UTF8LineEndOverlaps(position);  	} +	const Sci::Line linePosition = plv->LineFromPosition(position); +	Sci::Line lineInsert = linePosition + 1; + +	// A simple insertion is one that inserts valid text on a single line at a character boundary +	bool simpleInsertion = false; + +	// Check for breaking apart a UTF-8 sequence and inserting invalid UTF-8 +	if (utf8Substance && (plv->LineCharacterIndex() != SC_LINECHARACTERINDEX_NONE)) { +		// Actually, don't need to check that whole insertion is valid just that there +		// are no potential fragments at ends. +		simpleInsertion = UTF8IsCharacterBoundary(position) && +			UTF8IsValid(std::string_view(s, insertLength)); +	} +  	substance.InsertFromArray(position, s, 0, insertLength);  	if (hasStyles) {  		style.InsertValue(position, insertLength, 0);  	} -	Sci::Line lineInsert = plv->LineFromPosition(position) + 1;  	const bool atLineStart = plv->LineStart(lineInsert-1) == position;  	// Point all the lines after the insertion point further along in the buffer  	plv->InsertText(lineInsert-1, insertLength); @@ -684,6 +963,7 @@ void CellBuffer::BasicInsertString(Sci::Position position, const char *s, Sci::P  		if (ch == '\r') {  			InsertLine(lineInsert, (position + i) + 1, atLineStart);  			lineInsert++; +			simpleInsertion = false;  		} else if (ch == '\n') {  			if (chPrev == '\r') {  				// Patch up what was end of line @@ -692,11 +972,13 @@ void CellBuffer::BasicInsertString(Sci::Position position, const char *s, Sci::P  				InsertLine(lineInsert, (position + i) + 1, atLineStart);  				lineInsert++;  			} +			simpleInsertion = false;  		} else if (utf8LineEnds) {  			const unsigned char back3[3] = {chBeforePrev, chPrev, ch};  			if (UTF8IsSeparator(back3) || UTF8IsNEL(back3+1)) {  				InsertLine(lineInsert, (position + i) + 1, atLineStart);  				lineInsert++; +				simpleInsertion = false;  			}  		}  		chBeforePrev = chPrev; @@ -707,6 +989,7 @@ void CellBuffer::BasicInsertString(Sci::Position position, const char *s, Sci::P  		if (ch == '\r') {  			// End of line already in buffer so drop the newly created one  			RemoveLine(lineInsert - 1); +			simpleInsertion = false;  		}  	} else if (utf8LineEnds && !UTF8IsAscii(chAfter)) {  		// May have end of UTF-8 line end in buffer and start in insertion @@ -716,21 +999,31 @@ void CellBuffer::BasicInsertString(Sci::Position position, const char *s, Sci::P  			if (UTF8IsSeparator(back3)) {  				InsertLine(lineInsert, (position + insertLength + j) + 1, atLineStart);  				lineInsert++; +				simpleInsertion = false;  			}  			if ((j == 0) && UTF8IsNEL(back3+1)) {  				InsertLine(lineInsert, (position + insertLength + j) + 1, atLineStart);  				lineInsert++; +				simpleInsertion = false;  			}  			chBeforePrev = chPrev;  			chPrev = chAt;  		}  	} +	if (simpleInsertion) { +		const CountWidths cw = CountCharacterWidthsUTF8(std::string_view(s, insertLength)); +		plv->InsertCharacters(linePosition, cw); +	} else { +		RecalculateIndexLineStarts(linePosition, lineInsert - 1); +	}  }  void CellBuffer::BasicDeleteChars(Sci::Position position, Sci::Position deleteLength) {  	if (deleteLength == 0)  		return; +	Sci::Line lineRecalculateStart = INVALID_POSITION; +  	if ((position == 0) && (deleteLength == substance.Length())) {  		// If whole buffer is being deleted, faster to reinitialise lines data  		// than to delete each line. @@ -739,11 +1032,37 @@ void CellBuffer::BasicDeleteChars(Sci::Position position, Sci::Position deleteLe  		// Have to fix up line positions before doing deletion as looking at text in buffer  		// to work out which lines have been removed -		Sci::Line lineRemove = plv->LineFromPosition(position) + 1; +		const Sci::Line linePosition = plv->LineFromPosition(position); +		Sci::Line lineRemove = linePosition + 1; +  		plv->InsertText(lineRemove-1, - (deleteLength));  		const unsigned char chPrev = substance.ValueAt(position - 1);  		const unsigned char chBefore = chPrev;  		unsigned char chNext = substance.ValueAt(position); + +		// Check for breaking apart a UTF-8 sequence +		// Needs further checks that text is UTF-8 or that some other break apart is occurring +		if (utf8Substance && (plv->LineCharacterIndex() != SC_LINECHARACTERINDEX_NONE)) { +			const Sci::Position posEnd = position + deleteLength; +			const Sci::Line lineEndRemove = plv->LineFromPosition(posEnd); +			const bool simpleDeletion = +				(linePosition == lineEndRemove) && +				UTF8IsCharacterBoundary(position) && UTF8IsCharacterBoundary(posEnd); +			if (simpleDeletion) { +				std::string text(deleteLength, '\0'); +				GetCharRange(text.data(), position, deleteLength); +				if (UTF8IsValid(text)) { +					// Everything is good +					const CountWidths cw = CountCharacterWidthsUTF8(text); +					plv->InsertCharacters(linePosition, -cw); +				} else { +					lineRecalculateStart = linePosition; +				} +			} else { +				lineRecalculateStart = linePosition; +			} +		} +  		bool ignoreNL = false;  		if (chPrev == '\r' && chNext == '\n') {  			// Move back one @@ -792,6 +1111,9 @@ void CellBuffer::BasicDeleteChars(Sci::Position position, Sci::Position deleteLe  		}  	}  	substance.DeleteRange(position, deleteLength); +	if (lineRecalculateStart >= 0) { +		RecalculateIndexLineStarts(lineRecalculateStart, lineRecalculateStart); +	}  	if (hasStyles) {  		style.DeleteRange(position, deleteLength);  	} diff --git a/src/CellBuffer.h b/src/CellBuffer.h index f360b2a23..b9f2406f1 100644 --- a/src/CellBuffer.h +++ b/src/CellBuffer.h @@ -113,6 +113,7 @@ private:  	SplitVector<char> substance;  	SplitVector<char> style;  	bool readOnly; +	bool utf8Substance;  	int utf8LineEnds;  	bool collectingUndo; @@ -121,7 +122,9 @@ private:  	std::unique_ptr<ILineVector> plv;  	bool UTF8LineEndOverlaps(Sci::Position position) const; +	bool UTF8IsCharacterBoundary(Sci::Position position) const;  	void ResetLineEnds(); +	void RecalculateIndexLineStarts(Sci::Line lineFirst, Sci::Line lineLast);  	/// Actions without undo  	void BasicInsertString(Sci::Position position, const char *s, Sci::Position insertLength);  	void BasicDeleteChars(Sci::Position position, Sci::Position deleteLength); @@ -148,13 +151,19 @@ public:  	Sci::Position Length() const noexcept;  	void Allocate(Sci::Position newSize); +	void SetUTF8Substance(bool utf8Substance_);  	int GetLineEndTypes() const { return utf8LineEnds; }  	void SetLineEndTypes(int utf8LineEnds_);  	bool ContainsLineEnd(const char *s, Sci::Position length) const;  	void SetPerLine(PerLine *pl); +	int LineCharacterIndex() const noexcept; +	void AllocateLineCharacterIndex(int lineCharacterIndex); +	void ReleaseLineCharacterIndex(int lineCharacterIndex);  	Sci::Line Lines() const noexcept;  	Sci::Position LineStart(Sci::Line line) const noexcept; +	Sci::Position IndexLineStart(Sci::Line line, int lineCharacterIndex) const noexcept;  	Sci::Line LineFromPosition(Sci::Position pos) const noexcept; +	Sci::Line LineFromPositionIndex(Sci::Position pos, int lineCharacterIndex) const noexcept;  	void InsertLine(Sci::Line line, Sci::Position position, bool lineStart);  	void RemoveLine(Sci::Line line);  	const char *InsertString(Sci::Position position, const char *s, Sci::Position insertLength, bool &startSequence); diff --git a/src/Document.cxx b/src/Document.cxx index f3d8557ac..e53663f3e 100644 --- a/src/Document.cxx +++ b/src/Document.cxx @@ -119,6 +119,7 @@ Document::Document(int options) :  	decorations = DecorationListCreate(IsLarge());  	cb.SetPerLine(this); +	cb.SetUTF8Substance(SC_CP_UTF8 == dbcsCodePage);  }  Document::~Document() { @@ -194,6 +195,7 @@ bool Document::SetDBCSCodePage(int dbcsCodePage_) {  		dbcsCodePage = dbcsCodePage_;  		SetCaseFolder(nullptr);  		cb.SetLineEndTypes(lineEndBitSet & LineEndTypesSupported()); +		cb.SetUTF8Substance(SC_CP_UTF8 == dbcsCodePage);  		return true;  	} else {  		return false; @@ -420,6 +422,14 @@ Sci::Position Document::VCHomePosition(Sci::Position position) const {  		return startText;  } +Sci::Position Document::IndexLineStart(Sci::Line line, int lineCharacterIndex) const { +	return cb.IndexLineStart(line, lineCharacterIndex); +} + +Sci::Line Document::LineFromPositionIndex(Sci::Position pos, int lineCharacterIndex) const { +	return cb.LineFromPositionIndex(pos, lineCharacterIndex); +} +  int SCI_METHOD Document::SetLevel(Sci_Position line, int level) {  	const int prev = Levels()->SetLevel(static_cast<Sci::Line>(line), level, LinesTotal());  	if (prev != level) { @@ -2105,6 +2115,18 @@ const char *Document::SubstituteByPosition(const char *text, Sci::Position *leng  		return 0;  } +int Document::LineCharacterIndex() const { +	return cb.LineCharacterIndex(); +} + +void Document::AllocateLineCharacterIndex(int lineCharacterIndex) { +	return cb.AllocateLineCharacterIndex(lineCharacterIndex); +} + +void Document::ReleaseLineCharacterIndex(int lineCharacterIndex) { +	return cb.ReleaseLineCharacterIndex(lineCharacterIndex); +} +  Sci::Line Document::LinesTotal() const noexcept {  	return cb.Lines();  } diff --git a/src/Document.h b/src/Document.h index e1613cb20..0ef967e09 100644 --- a/src/Document.h +++ b/src/Document.h @@ -389,6 +389,8 @@ public:  	bool IsLineEndPosition(Sci::Position position) const;  	bool IsPositionInLineEnd(Sci::Position position) const;  	Sci::Position VCHomePosition(Sci::Position position) const; +	Sci::Position IndexLineStart(Sci::Line line, int lineCharacterIndex) const; +	Sci::Line LineFromPositionIndex(Sci::Position pos, int lineCharacterIndex) const;  	int SCI_METHOD SetLevel(Sci_Position line, int level) override;  	int SCI_METHOD GetLevel(Sci_Position line) const override; @@ -414,6 +416,9 @@ public:  	void SetCaseFolder(CaseFolder *pcf_);  	Sci::Position FindText(Sci::Position minPos, Sci::Position maxPos, const char *search, int flags, Sci::Position *length);  	const char *SubstituteByPosition(const char *text, Sci::Position *length); +	int LineCharacterIndex() const; +	void AllocateLineCharacterIndex(int lineCharacterIndex); +	void ReleaseLineCharacterIndex(int lineCharacterIndex);  	Sci::Line LinesTotal() const noexcept;  	void SetDefaultCharClasses(bool includeWordClass); diff --git a/src/Editor.cxx b/src/Editor.cxx index 86c0536a1..3093e6c57 100644 --- a/src/Editor.cxx +++ b/src/Editor.cxx @@ -6020,6 +6020,11 @@ sptr_t Editor::WndProc(unsigned int iMessage, uptr_t wParam, sptr_t lParam) {  			static_cast<Sci::Position>(wParam), lParam),  			0, pdoc->Length()); +	case SCI_POSITIONRELATIVECODEUNITS: +		return std::clamp<Sci::Position>(pdoc->GetRelativePositionUTF16( +			static_cast<Sci::Position>(wParam), lParam), +			0, pdoc->Length()); +  	case SCI_LINESCROLL:  		ScrollTo(topLine + static_cast<Sci::Line>(lParam));  		HorizontalScrollTo(xOffset + static_cast<int>(wParam) * static_cast<int>(vs.spaceWidth)); @@ -6785,6 +6790,23 @@ sptr_t Editor::WndProc(unsigned int iMessage, uptr_t wParam, sptr_t lParam) {  	case SCI_GETBIDIRECTIONAL:  		return static_cast<sptr_t>(bidirectional); +	case SCI_GETLINECHARACTERINDEX: +		return pdoc->LineCharacterIndex(); + +	case SCI_ALLOCATELINECHARACTERINDEX: +		pdoc->AllocateLineCharacterIndex(static_cast<int>(wParam)); +		break; + +	case SCI_RELEASELINECHARACTERINDEX: +		pdoc->ReleaseLineCharacterIndex(static_cast<int>(wParam)); +		break; + +	case SCI_LINEFROMINDEXPOSITION: +		return pdoc->LineFromPositionIndex(static_cast<Sci::Position>(wParam), static_cast<int>(lParam)); + +	case SCI_INDEXPOSITIONFROMLINE: +		return pdoc->IndexLineStart(static_cast<Sci::Line>(wParam), static_cast<int>(lParam)); +  		// Marker definition and setting  	case SCI_MARKERDEFINE:  		if (wParam <= MARKER_MAX) { @@ -8190,6 +8212,10 @@ sptr_t Editor::WndProc(unsigned int iMessage, uptr_t wParam, sptr_t lParam) {  	case SCI_COUNTCHARACTERS:  		return pdoc->CountCharacters(static_cast<Sci::Position>(wParam), lParam); +		//return pdoc->CountCharacters(static_cast<Sci::Position>(wParam), static_cast<Sci::Position>(lParam)); + +	case SCI_COUNTCODEUNITS: +		return pdoc->CountUTF16(static_cast<Sci::Position>(wParam), lParam);  	default:  		return DefWndProc(iMessage, wParam, lParam); diff --git a/src/UniConversion.cxx b/src/UniConversion.cxx index 3b7472638..58475687b 100644 --- a/src/UniConversion.cxx +++ b/src/UniConversion.cxx @@ -340,6 +340,22 @@ int UTF8DrawBytes(const unsigned char *us, int len) noexcept {  	return (utf8StatusNext & UTF8MaskInvalid) ? 1 : (utf8StatusNext & UTF8MaskWidth);  } +bool UTF8IsValid(std::string_view sv) noexcept { +	const unsigned char *us = reinterpret_cast<const unsigned char *>(sv.data()); +	size_t remaining = sv.length(); +	while (remaining > 0) { +		const int utf8Status = UTF8Classify(us, remaining); +		if (utf8Status & UTF8MaskInvalid) { +			return false; +		} else { +			const int lenChar = utf8Status & UTF8MaskWidth; +			us += lenChar; +			remaining -= lenChar; +		} +	} +	return remaining == 0; +} +  // Replace invalid bytes in UTF-8 with the replacement character  std::string FixInvalidUTF8(const std::string &text) {  	std::string result; diff --git a/src/UniConversion.h b/src/UniConversion.h index 6d257cd8e..c676230da 100644 --- a/src/UniConversion.h +++ b/src/UniConversion.h @@ -22,6 +22,7 @@ size_t UTF16Length(std::string_view sv);  size_t UTF16FromUTF8(std::string_view sv, wchar_t *tbuf, size_t tlen);  size_t UTF32FromUTF8(std::string_view sv, unsigned int *tbuf, size_t tlen);  unsigned int UTF16FromUTF32Character(unsigned int val, wchar_t *tbuf) noexcept; +bool UTF8IsValid(std::string_view sv) noexcept;  std::string FixInvalidUTF8(const std::string &text);  extern const unsigned char UTF8BytesOfLead[256]; @@ -49,6 +50,9 @@ inline constexpr bool UTF8IsAscii(int ch) noexcept {  enum { UTF8MaskWidth=0x7, UTF8MaskInvalid=0x8 };  int UTF8Classify(const unsigned char *us, size_t len) noexcept; +inline int UTF8Classify(std::string_view sv) noexcept { +	return UTF8Classify(reinterpret_cast<const unsigned char *>(sv.data()), sv.length()); +}  // Similar to UTF8Classify but returns a length of 1 for invalid bytes  // instead of setting the invalid flag diff --git a/test/simpleTests.py b/test/simpleTests.py index 3ff283dad..b1e8efdb7 100644 --- a/test/simpleTests.py +++ b/test/simpleTests.py @@ -1631,6 +1631,76 @@ class TestStyleAttributes(unittest.TestCase):  		self.ed.StyleSetHotSpot(self.ed.STYLE_DEFAULT, 1)  		self.assertEquals(self.ed.StyleGetHotSpot(self.ed.STYLE_DEFAULT), 1) +class TestIndices(unittest.TestCase): +	def setUp(self): +		self.xite = Xite.xiteFrame +		self.ed = self.xite.ed +		self.ed.ClearAll() +		self.ed.EmptyUndoBuffer() +		self.ed.SetCodePage(65001) +		# Text includes one non-BMP character +		t = "aå\U00010348flﬔ-\n" +		self.tv = t.encode("UTF-8") + +	def tearDown(self): +		self.ed.SetCodePage(0) + +	def testAllocation(self): +		self.assertEquals(self.ed.GetLineCharacterIndex(), self.ed.SC_LINECHARACTERINDEX_NONE) +		self.ed.AllocateLineCharacterIndex(self.ed.SC_LINECHARACTERINDEX_UTF32) +		self.assertEquals(self.ed.GetLineCharacterIndex(), self.ed.SC_LINECHARACTERINDEX_UTF32) +		self.ed.ReleaseLineCharacterIndex(self.ed.SC_LINECHARACTERINDEX_UTF32) +		self.assertEquals(self.ed.GetLineCharacterIndex(), self.ed.SC_LINECHARACTERINDEX_NONE) + +	def testUTF32(self): +		self.assertEquals(self.ed.GetLineCharacterIndex(), self.ed.SC_LINECHARACTERINDEX_NONE) +		self.ed.SetContents(self.tv) +		self.ed.AllocateLineCharacterIndex(self.ed.SC_LINECHARACTERINDEX_UTF32) +		self.assertEquals(self.ed.IndexPositionFromLine(0, self.ed.SC_LINECHARACTERINDEX_UTF32), 0) +		self.assertEquals(self.ed.IndexPositionFromLine(1, self.ed.SC_LINECHARACTERINDEX_UTF32), 7) +		self.ed.ReleaseLineCharacterIndex(self.ed.SC_LINECHARACTERINDEX_UTF32) +		self.assertEquals(self.ed.GetLineCharacterIndex(), self.ed.SC_LINECHARACTERINDEX_NONE) + +	def testUTF16(self): +		self.assertEquals(self.ed.GetLineCharacterIndex(), self.ed.SC_LINECHARACTERINDEX_NONE) +		t = "aå\U00010348flﬔ-" +		tv = t.encode("UTF-8") +		self.ed.SetContents(self.tv) +		self.ed.AllocateLineCharacterIndex(self.ed.SC_LINECHARACTERINDEX_UTF16) +		self.assertEquals(self.ed.IndexPositionFromLine(0, self.ed.SC_LINECHARACTERINDEX_UTF16), 0) +		self.assertEquals(self.ed.IndexPositionFromLine(1, self.ed.SC_LINECHARACTERINDEX_UTF16), 8) +		self.ed.ReleaseLineCharacterIndex(self.ed.SC_LINECHARACTERINDEX_UTF16) +		self.assertEquals(self.ed.GetLineCharacterIndex(), self.ed.SC_LINECHARACTERINDEX_NONE) + +	def testBoth(self): +		# Set text before turning indices on +		self.assertEquals(self.ed.GetLineCharacterIndex(), self.ed.SC_LINECHARACTERINDEX_NONE) +		self.ed.SetContents(self.tv) +		self.ed.AllocateLineCharacterIndex(self.ed.SC_LINECHARACTERINDEX_UTF32+self.ed.SC_LINECHARACTERINDEX_UTF16) +		self.assertEquals(self.ed.IndexPositionFromLine(0, self.ed.SC_LINECHARACTERINDEX_UTF32), 0) +		self.assertEquals(self.ed.IndexPositionFromLine(1, self.ed.SC_LINECHARACTERINDEX_UTF32), 7) +		self.assertEquals(self.ed.IndexPositionFromLine(0, self.ed.SC_LINECHARACTERINDEX_UTF16), 0) +		self.assertEquals(self.ed.IndexPositionFromLine(1, self.ed.SC_LINECHARACTERINDEX_UTF16), 8) +		# Test the inverse: position->line +		self.assertEquals(self.ed.LineFromIndexPosition(0, self.ed.SC_LINECHARACTERINDEX_UTF32), 0) +		self.assertEquals(self.ed.LineFromIndexPosition(7, self.ed.SC_LINECHARACTERINDEX_UTF32), 1) +		self.assertEquals(self.ed.LineFromIndexPosition(0, self.ed.SC_LINECHARACTERINDEX_UTF16), 0) +		self.assertEquals(self.ed.LineFromIndexPosition(8, self.ed.SC_LINECHARACTERINDEX_UTF16), 1) +		self.ed.ReleaseLineCharacterIndex(self.ed.SC_LINECHARACTERINDEX_UTF32+self.ed.SC_LINECHARACTERINDEX_UTF16) +		self.assertEquals(self.ed.GetLineCharacterIndex(), self.ed.SC_LINECHARACTERINDEX_NONE) + +	def testMaintenance(self): +		# Set text after turning indices on +		self.assertEquals(self.ed.GetLineCharacterIndex(), self.ed.SC_LINECHARACTERINDEX_NONE) +		self.ed.AllocateLineCharacterIndex(self.ed.SC_LINECHARACTERINDEX_UTF32+self.ed.SC_LINECHARACTERINDEX_UTF16) +		self.ed.SetContents(self.tv) +		self.assertEquals(self.ed.IndexPositionFromLine(0, self.ed.SC_LINECHARACTERINDEX_UTF32), 0) +		self.assertEquals(self.ed.IndexPositionFromLine(1, self.ed.SC_LINECHARACTERINDEX_UTF32), 7) +		self.assertEquals(self.ed.IndexPositionFromLine(0, self.ed.SC_LINECHARACTERINDEX_UTF16), 0) +		self.assertEquals(self.ed.IndexPositionFromLine(1, self.ed.SC_LINECHARACTERINDEX_UTF16), 8) +		self.ed.ReleaseLineCharacterIndex(self.ed.SC_LINECHARACTERINDEX_UTF32+self.ed.SC_LINECHARACTERINDEX_UTF16) +		self.assertEquals(self.ed.GetLineCharacterIndex(), self.ed.SC_LINECHARACTERINDEX_NONE) +  class TestCharacterNavigation(unittest.TestCase):  	def setUp(self):  		self.xite = Xite.xiteFrame @@ -1677,6 +1747,31 @@ class TestCharacterNavigation(unittest.TestCase):  			self.assert_(after < previous)  			previous = after +	def testRelativeNonBOM(self): +		# \x61  \xF0\x90\x8D\x88  \xef\xac\x82   \xef\xac\x94   \x2d +		t = "a\U00010348flﬔ-" +		tv = t.encode("UTF-8") +		self.ed.SetContents(tv) +		self.assertEquals(self.ed.PositionRelative(1, 2), 8) +		self.assertEquals(self.ed.CountCharacters(1, 8), 2) +		self.assertEquals(self.ed.CountCodeUnits(1, 8), 3) +		self.assertEquals(self.ed.PositionRelative(8, -2), 1) +		self.assertEquals(self.ed.PositionRelativeCodeUnits(8, -3), 1) +		pos = 0 +		previous = 0 +		for i in range(1, len(t)): +			after = self.ed.PositionRelative(pos, i) +			self.assert_(after > pos) +			self.assert_(after > previous) +			previous = after +		pos = len(t) +		previous = pos +		for i in range(1, len(t)-1): +			after = self.ed.PositionRelative(pos, -i) +			self.assert_(after < pos) +			self.assert_(after <= previous) +			previous = after +  	def testLineEnd(self):  		t = "a\r\nb\nc"  		tv = t.encode("UTF-8") diff --git a/test/unit/testCellBuffer.cxx b/test/unit/testCellBuffer.cxx index 067fa4bc1..e6e486e58 100644 --- a/test/unit/testCellBuffer.cxx +++ b/test/unit/testCellBuffer.cxx @@ -10,6 +10,7 @@  #include "Platform.h" +#include "Scintilla.h"  #include "Position.h"  #include "SplitVector.h"  #include "Partitioning.h" @@ -145,3 +146,290 @@ TEST_CASE("CellBuffer") {  	}  } + +TEST_CASE("CharacterIndex") { + +	CellBuffer cb(true, false); + +	SECTION("Setup") { +		REQUIRE(cb.LineCharacterIndex() == SC_LINECHARACTERINDEX_NONE); +		REQUIRE(cb.IndexLineStart(0, SC_LINECHARACTERINDEX_UTF16) == 0); +		REQUIRE(cb.IndexLineStart(1, SC_LINECHARACTERINDEX_UTF16) == 0); +		cb.SetUTF8Substance(true); + +		cb.AllocateLineCharacterIndex(SC_LINECHARACTERINDEX_UTF16); +		REQUIRE(cb.LineCharacterIndex() == SC_LINECHARACTERINDEX_UTF16); + +		REQUIRE(cb.IndexLineStart(0, SC_LINECHARACTERINDEX_UTF16) == 0); +		REQUIRE(cb.IndexLineStart(1, SC_LINECHARACTERINDEX_UTF16) == 0); + +		cb.ReleaseLineCharacterIndex(SC_LINECHARACTERINDEX_UTF16); +		REQUIRE(cb.LineCharacterIndex() == SC_LINECHARACTERINDEX_NONE); +	} + +	SECTION("Insertion") { +		cb.SetUTF8Substance(true); + +		cb.AllocateLineCharacterIndex(SC_LINECHARACTERINDEX_UTF16 | SC_LINECHARACTERINDEX_UTF32); + +		bool startSequence = false; +		cb.InsertString(0, "a", 1, startSequence); +		REQUIRE(cb.IndexLineStart(0, SC_LINECHARACTERINDEX_UTF16) == 0); +		REQUIRE(cb.IndexLineStart(1, SC_LINECHARACTERINDEX_UTF16) == 1); +		REQUIRE(cb.IndexLineStart(0, SC_LINECHARACTERINDEX_UTF32) == 0); +		REQUIRE(cb.IndexLineStart(1, SC_LINECHARACTERINDEX_UTF32) == 1); + +		const char *hwair = "\xF0\x90\x8D\x88"; +		cb.InsertString(0, hwair, strlen(hwair), startSequence); +		REQUIRE(cb.IndexLineStart(0, SC_LINECHARACTERINDEX_UTF16) == 0); +		REQUIRE(cb.IndexLineStart(1, SC_LINECHARACTERINDEX_UTF16) == 3); +		REQUIRE(cb.IndexLineStart(0, SC_LINECHARACTERINDEX_UTF32) == 0); +		REQUIRE(cb.IndexLineStart(1, SC_LINECHARACTERINDEX_UTF32) == 2); +	} + +	SECTION("Deletion") { +		cb.SetUTF8Substance(true); + +		cb.AllocateLineCharacterIndex(SC_LINECHARACTERINDEX_UTF16 | SC_LINECHARACTERINDEX_UTF32); + +		bool startSequence = false; +		const char *hwair = "a\xF0\x90\x8D\x88z"; +		cb.InsertString(0, hwair, strlen(hwair), startSequence); + +		REQUIRE(cb.IndexLineStart(0, SC_LINECHARACTERINDEX_UTF16) == 0); +		REQUIRE(cb.IndexLineStart(1, SC_LINECHARACTERINDEX_UTF16) == 4); +		REQUIRE(cb.IndexLineStart(0, SC_LINECHARACTERINDEX_UTF32) == 0); +		REQUIRE(cb.IndexLineStart(1, SC_LINECHARACTERINDEX_UTF32) == 3); + +		cb.DeleteChars(5, 1, startSequence); + +		REQUIRE(cb.IndexLineStart(0, SC_LINECHARACTERINDEX_UTF16) == 0); +		REQUIRE(cb.IndexLineStart(1, SC_LINECHARACTERINDEX_UTF16) == 3); +		REQUIRE(cb.IndexLineStart(0, SC_LINECHARACTERINDEX_UTF32) == 0); +		REQUIRE(cb.IndexLineStart(1, SC_LINECHARACTERINDEX_UTF32) == 2); + +		cb.DeleteChars(1, 4, startSequence); + +		REQUIRE(cb.IndexLineStart(0, SC_LINECHARACTERINDEX_UTF16) == 0); +		REQUIRE(cb.IndexLineStart(1, SC_LINECHARACTERINDEX_UTF16) == 1); +		REQUIRE(cb.IndexLineStart(0, SC_LINECHARACTERINDEX_UTF32) == 0); +		REQUIRE(cb.IndexLineStart(1, SC_LINECHARACTERINDEX_UTF32) == 1); +	} + +	SECTION("Insert Complex") { +		cb.SetUTF8Substance(true); +		cb.SetLineEndTypes(1); +		cb.AllocateLineCharacterIndex(SC_LINECHARACTERINDEX_UTF16 | SC_LINECHARACTERINDEX_UTF32); + +		bool startSequence = false; +		// 3 lines of text containing 8 bytes +		const char *data = "a\n\xF0\x90\x8D\x88\nz"; +		cb.InsertString(0, data, strlen(data), startSequence); + +		REQUIRE(cb.IndexLineStart(0, SC_LINECHARACTERINDEX_UTF16) == 0); +		REQUIRE(cb.IndexLineStart(1, SC_LINECHARACTERINDEX_UTF16) == 2); +		REQUIRE(cb.IndexLineStart(2, SC_LINECHARACTERINDEX_UTF16) == 5); +		REQUIRE(cb.IndexLineStart(3, SC_LINECHARACTERINDEX_UTF16) == 6); + +		REQUIRE(cb.IndexLineStart(0, SC_LINECHARACTERINDEX_UTF32) == 0); +		REQUIRE(cb.IndexLineStart(1, SC_LINECHARACTERINDEX_UTF32) == 2); +		REQUIRE(cb.IndexLineStart(2, SC_LINECHARACTERINDEX_UTF32) == 4); +		REQUIRE(cb.IndexLineStart(3, SC_LINECHARACTERINDEX_UTF32) == 5); + +		// Insert a new line at end -> "a\n\xF0\x90\x8D\x88\nz\n" 4 lines +		// Last line empty +		cb.InsertString(strlen(data), "\n", 1, startSequence); + +		REQUIRE(cb.IndexLineStart(0, SC_LINECHARACTERINDEX_UTF16) == 0); +		REQUIRE(cb.IndexLineStart(1, SC_LINECHARACTERINDEX_UTF16) == 2); +		REQUIRE(cb.IndexLineStart(2, SC_LINECHARACTERINDEX_UTF16) == 5); +		REQUIRE(cb.IndexLineStart(3, SC_LINECHARACTERINDEX_UTF16) == 7); +		REQUIRE(cb.IndexLineStart(4, SC_LINECHARACTERINDEX_UTF16) == 7); + +		REQUIRE(cb.IndexLineStart(0, SC_LINECHARACTERINDEX_UTF32) == 0); +		REQUIRE(cb.IndexLineStart(1, SC_LINECHARACTERINDEX_UTF32) == 2); +		REQUIRE(cb.IndexLineStart(2, SC_LINECHARACTERINDEX_UTF32) == 4); +		REQUIRE(cb.IndexLineStart(3, SC_LINECHARACTERINDEX_UTF32) == 6); +		REQUIRE(cb.IndexLineStart(4, SC_LINECHARACTERINDEX_UTF32) == 6); + +		// Insert a new line before end -> "a\n\xF0\x90\x8D\x88\nz\n\n" 5 lines +		cb.InsertString(strlen(data), "\n", 1, startSequence); + +		REQUIRE(cb.IndexLineStart(0, SC_LINECHARACTERINDEX_UTF16) == 0); +		REQUIRE(cb.IndexLineStart(1, SC_LINECHARACTERINDEX_UTF16) == 2); +		REQUIRE(cb.IndexLineStart(2, SC_LINECHARACTERINDEX_UTF16) == 5); +		REQUIRE(cb.IndexLineStart(3, SC_LINECHARACTERINDEX_UTF16) == 7); +		REQUIRE(cb.IndexLineStart(4, SC_LINECHARACTERINDEX_UTF16) == 8); +		REQUIRE(cb.IndexLineStart(5, SC_LINECHARACTERINDEX_UTF16) == 8); + +		REQUIRE(cb.IndexLineStart(0, SC_LINECHARACTERINDEX_UTF32) == 0); +		REQUIRE(cb.IndexLineStart(1, SC_LINECHARACTERINDEX_UTF32) == 2); +		REQUIRE(cb.IndexLineStart(2, SC_LINECHARACTERINDEX_UTF32) == 4); +		REQUIRE(cb.IndexLineStart(3, SC_LINECHARACTERINDEX_UTF32) == 6); +		REQUIRE(cb.IndexLineStart(4, SC_LINECHARACTERINDEX_UTF32) == 7); +		REQUIRE(cb.IndexLineStart(5, SC_LINECHARACTERINDEX_UTF32) == 7); + +		// Insert a valid 3-byte UTF-8 character at start ->  +		// "\xE2\x82\xACa\n\xF0\x90\x8D\x88\nz\n\n" 5 lines + +		const char *euro = "\xE2\x82\xAC"; +		cb.InsertString(0, euro, strlen(euro), startSequence); + +		REQUIRE(cb.IndexLineStart(0, SC_LINECHARACTERINDEX_UTF16) == 0); +		REQUIRE(cb.IndexLineStart(1, SC_LINECHARACTERINDEX_UTF16) == 3); +		REQUIRE(cb.IndexLineStart(2, SC_LINECHARACTERINDEX_UTF16) == 6); +		REQUIRE(cb.IndexLineStart(3, SC_LINECHARACTERINDEX_UTF16) == 8); +		REQUIRE(cb.IndexLineStart(4, SC_LINECHARACTERINDEX_UTF16) == 9); +		REQUIRE(cb.IndexLineStart(5, SC_LINECHARACTERINDEX_UTF16) == 9); + +		REQUIRE(cb.IndexLineStart(0, SC_LINECHARACTERINDEX_UTF32) == 0); +		REQUIRE(cb.IndexLineStart(1, SC_LINECHARACTERINDEX_UTF32) == 3); +		REQUIRE(cb.IndexLineStart(2, SC_LINECHARACTERINDEX_UTF32) == 5); +		REQUIRE(cb.IndexLineStart(3, SC_LINECHARACTERINDEX_UTF32) == 7); +		REQUIRE(cb.IndexLineStart(4, SC_LINECHARACTERINDEX_UTF32) == 8); +		REQUIRE(cb.IndexLineStart(5, SC_LINECHARACTERINDEX_UTF32) == 8); + +		// Insert a lone lead byte implying a 3 byte character at start of line 2 ->  +		// "\xE2\x82\xACa\n\EF\xF0\x90\x8D\x88\nz\n\n" 5 lines +		// Should be treated as a single byte character + +		const char *lead = "\xEF"; +		cb.InsertString(5, lead, strlen(lead), startSequence); + +		REQUIRE(cb.IndexLineStart(0, SC_LINECHARACTERINDEX_UTF16) == 0); +		REQUIRE(cb.IndexLineStart(1, SC_LINECHARACTERINDEX_UTF16) == 3); +		REQUIRE(cb.IndexLineStart(2, SC_LINECHARACTERINDEX_UTF16) == 7); +		REQUIRE(cb.IndexLineStart(3, SC_LINECHARACTERINDEX_UTF16) == 9); + +		REQUIRE(cb.IndexLineStart(0, SC_LINECHARACTERINDEX_UTF32) == 0); +		REQUIRE(cb.IndexLineStart(1, SC_LINECHARACTERINDEX_UTF32) == 3); +		REQUIRE(cb.IndexLineStart(2, SC_LINECHARACTERINDEX_UTF32) == 6); +		REQUIRE(cb.IndexLineStart(3, SC_LINECHARACTERINDEX_UTF32) == 8); + +		// Insert an ASCII lead byte inside the 3-byte initial character -> +		// "\xE2!\x82\xACa\n\EF\xF0\x90\x8D\x88\nz\n\n" 5 lines +		// It should b treated as a single character and should cause the +		// byte before and the 2 bytes after also be each treated as singles +		// so 3 more characters on line 0. + +		const char *ascii = "!"; +		cb.InsertString(1, ascii, strlen(ascii), startSequence); + +		REQUIRE(cb.IndexLineStart(0, SC_LINECHARACTERINDEX_UTF16) == 0); +		REQUIRE(cb.IndexLineStart(1, SC_LINECHARACTERINDEX_UTF16) == 6); +		REQUIRE(cb.IndexLineStart(2, SC_LINECHARACTERINDEX_UTF16) == 10); + +		REQUIRE(cb.IndexLineStart(0, SC_LINECHARACTERINDEX_UTF32) == 0); +		REQUIRE(cb.IndexLineStart(1, SC_LINECHARACTERINDEX_UTF32) == 6); +		REQUIRE(cb.IndexLineStart(2, SC_LINECHARACTERINDEX_UTF32) == 9); + +		// Insert a NEL after the '!' to trigger the utf8 line end case -> +		// "\xE2!\xC2\x85 \x82\xACa\n \EF\xF0\x90\x8D\x88\n z\n\n" 5 lines + +		const char *nel = "\xC2\x85"; +		cb.InsertString(2, nel, strlen(nel), startSequence); + +		REQUIRE(cb.IndexLineStart(0, SC_LINECHARACTERINDEX_UTF16) == 0); +		REQUIRE(cb.IndexLineStart(1, SC_LINECHARACTERINDEX_UTF16) == 3); +		REQUIRE(cb.IndexLineStart(2, SC_LINECHARACTERINDEX_UTF16) == 7); +		REQUIRE(cb.IndexLineStart(3, SC_LINECHARACTERINDEX_UTF16) == 11); + +		REQUIRE(cb.IndexLineStart(0, SC_LINECHARACTERINDEX_UTF32) == 0); +		REQUIRE(cb.IndexLineStart(1, SC_LINECHARACTERINDEX_UTF32) == 3); +		REQUIRE(cb.IndexLineStart(2, SC_LINECHARACTERINDEX_UTF32) == 7); +		REQUIRE(cb.IndexLineStart(3, SC_LINECHARACTERINDEX_UTF32) == 10); +	} + +	SECTION("Delete Multiple lines") { +		cb.SetUTF8Substance(true); +		cb.AllocateLineCharacterIndex(SC_LINECHARACTERINDEX_UTF16 | SC_LINECHARACTERINDEX_UTF32); + +		bool startSequence = false; +		// 3 lines of text containing 8 bytes +		const char *data = "a\n\xF0\x90\x8D\x88\nz\nc"; +		cb.InsertString(0, data, strlen(data), startSequence); + +		// Delete first 2 new lines -> "az\nc" +		cb.DeleteChars(1, strlen(data) - 4, startSequence); + +		REQUIRE(cb.IndexLineStart(0, SC_LINECHARACTERINDEX_UTF16) == 0); +		REQUIRE(cb.IndexLineStart(1, SC_LINECHARACTERINDEX_UTF16) == 3); +		REQUIRE(cb.IndexLineStart(2, SC_LINECHARACTERINDEX_UTF16) == 4); + +		REQUIRE(cb.IndexLineStart(0, SC_LINECHARACTERINDEX_UTF32) == 0); +		REQUIRE(cb.IndexLineStart(1, SC_LINECHARACTERINDEX_UTF32) == 3); +		REQUIRE(cb.IndexLineStart(2, SC_LINECHARACTERINDEX_UTF32) == 4); +	} + +	SECTION("Delete Complex") { +		cb.SetUTF8Substance(true); +		cb.AllocateLineCharacterIndex(SC_LINECHARACTERINDEX_UTF16 | SC_LINECHARACTERINDEX_UTF32); + +		bool startSequence = false; +		// 3 lines of text containing 8 bytes +		const char *data = "a\n\xF0\x90\x8D\x88\nz"; +		cb.InsertString(0, data, strlen(data), startSequence); + +		// Delete lead byte from character on line 1 -> +		// "a\n\x90\x8D\x88\nz" +		// line 1 becomes 4 single byte characters +		cb.DeleteChars(2, 1, startSequence); + +		REQUIRE(cb.IndexLineStart(0, SC_LINECHARACTERINDEX_UTF16) == 0); +		REQUIRE(cb.IndexLineStart(1, SC_LINECHARACTERINDEX_UTF16) == 2); +		REQUIRE(cb.IndexLineStart(2, SC_LINECHARACTERINDEX_UTF16) == 6); +		REQUIRE(cb.IndexLineStart(3, SC_LINECHARACTERINDEX_UTF16) == 7); + +		REQUIRE(cb.IndexLineStart(0, SC_LINECHARACTERINDEX_UTF32) == 0); +		REQUIRE(cb.IndexLineStart(1, SC_LINECHARACTERINDEX_UTF32) == 2); +		REQUIRE(cb.IndexLineStart(2, SC_LINECHARACTERINDEX_UTF32) == 6); +		REQUIRE(cb.IndexLineStart(3, SC_LINECHARACTERINDEX_UTF32) == 7); + +		// Delete first new line -> +		// "a\x90\x8D\x88\nz" +		// Only 2 lines with line 0 containing 5 single byte characters +		cb.DeleteChars(1, 1, startSequence); + +		REQUIRE(cb.IndexLineStart(0, SC_LINECHARACTERINDEX_UTF16) == 0); +		REQUIRE(cb.IndexLineStart(1, SC_LINECHARACTERINDEX_UTF16) == 5); +		REQUIRE(cb.IndexLineStart(2, SC_LINECHARACTERINDEX_UTF16) == 6); + +		REQUIRE(cb.IndexLineStart(0, SC_LINECHARACTERINDEX_UTF32) == 0); +		REQUIRE(cb.IndexLineStart(1, SC_LINECHARACTERINDEX_UTF32) == 5); +		REQUIRE(cb.IndexLineStart(2, SC_LINECHARACTERINDEX_UTF32) == 6); + +		// Restore lead byte from character on line 0 making a 4-byte character -> +		// "a\xF0\x90\x8D\x88\nz" + +		const char *lead4 = "\xF0"; +		cb.InsertString(1, lead4, strlen(lead4), startSequence); + +		REQUIRE(cb.IndexLineStart(0, SC_LINECHARACTERINDEX_UTF16) == 0); +		REQUIRE(cb.IndexLineStart(1, SC_LINECHARACTERINDEX_UTF16) == 4); +		REQUIRE(cb.IndexLineStart(2, SC_LINECHARACTERINDEX_UTF16) == 5); + +		REQUIRE(cb.IndexLineStart(0, SC_LINECHARACTERINDEX_UTF32) == 0); +		REQUIRE(cb.IndexLineStart(1, SC_LINECHARACTERINDEX_UTF32) == 3); +		REQUIRE(cb.IndexLineStart(2, SC_LINECHARACTERINDEX_UTF32) == 4); +	} + +	SECTION("Insert separates new line bytes") { +		cb.SetUTF8Substance(true); +		cb.AllocateLineCharacterIndex(SC_LINECHARACTERINDEX_UTF16 | SC_LINECHARACTERINDEX_UTF32); + +		bool startSequence = false; +		// 2 lines of text containing 4 bytes +		const char *data = "a\r\nb"; +		cb.InsertString(0, data, strlen(data), startSequence); + +		// 3 lines of text containing 5 bytes -> +		// "a\r!\nb" +		const char *ascii = "!"; +		cb.InsertString(2, ascii, strlen(ascii), startSequence); + +		REQUIRE(cb.IndexLineStart(0, SC_LINECHARACTERINDEX_UTF16) == 0); +		REQUIRE(cb.IndexLineStart(1, SC_LINECHARACTERINDEX_UTF16) == 2); +		REQUIRE(cb.IndexLineStart(2, SC_LINECHARACTERINDEX_UTF16) == 4); +		REQUIRE(cb.IndexLineStart(3, SC_LINECHARACTERINDEX_UTF16) == 5); +	} +} | 
