diff options
| -rw-r--r-- | src/CharacterType.h | 7 | ||||
| -rw-r--r-- | src/Document.cxx | 89 | ||||
| -rw-r--r-- | src/Document.h | 2 | ||||
| -rw-r--r-- | src/PositionCache.cxx | 21 | ||||
| -rw-r--r-- | test/unit/testDocument.cxx | 117 | 
5 files changed, 193 insertions, 43 deletions
| diff --git a/src/CharacterType.h b/src/CharacterType.h index b014f1050..437fb8c5c 100644 --- a/src/CharacterType.h +++ b/src/CharacterType.h @@ -32,6 +32,13 @@ constexpr bool IsEOLCharacter(int ch) noexcept {  	return ch == '\r' || ch == '\n';  } +constexpr bool IsBreakSpace(int ch) noexcept { +	// used for text breaking, treat C0 control character as space. +	// by default C0 control character is handled as special representation, +	// so not appears in normal text. 0x7F DEL is omitted to simplify the code. +	return ch >= 0 && ch <= ' '; +} +  constexpr bool IsADigit(int ch) noexcept {  	return (ch >= '0') && (ch <= '9');  } diff --git a/src/Document.cxx b/src/Document.cxx index 3ebd357df..0d8b00d09 100644 --- a/src/Document.cxx +++ b/src/Document.cxx @@ -1127,47 +1127,74 @@ bool Document::IsDBCSDualByteAt(Sci::Position pos) const noexcept {  		&& IsDBCSTrailByteNoExcept(cb.CharAt(pos + 1));  } -// Need to break text into segments near lengthSegment but taking into -// account the encoding to not break inside a UTF-8 or DBCS character -// and also trying to avoid breaking inside a pair of combining characters. +// Need to break text into segments near end but taking into account the +// encoding to not break inside a UTF-8 or DBCS character and also trying +// to avoid breaking inside a pair of combining characters, or inside +// ligatures. +// TODO: implement grapheme cluster boundaries, +// see https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries. +//  // The segment length must always be long enough (more than 4 bytes)  // so that there will be at least one whole character to make a segment.  // For UTF-8, text must consist only of valid whole characters.  // In preference order from best to worst: -//   1) Break after space -//   2) Break before punctuation -//   3) Break after whole character - -int Document::SafeSegment(const char *text, int lengthSegment) const noexcept { -	int lastSpaceBreak = -1; -	int lastPunctuationBreak = -1; -	int lastEncodingAllowedBreak = 0; -	for (int j=0; j < lengthSegment;) { -		const unsigned char ch = text[j]; -		if (j > 0) { -			if (IsSpaceOrTab(text[j - 1]) && !IsSpaceOrTab(text[j])) { -				lastSpaceBreak = j; +//   1) Break before or after spaces or controls +//   2) Break at word and punctuation boundary for better kerning and ligature support +//   3) Break after whole character, this may break combining characters + +size_t Document::SafeSegment(std::string_view text) const noexcept { +	// check space first as most written language use spaces. +	for (std::string_view::iterator it = text.end() - 1; it != text.begin(); --it) { +		if (IsBreakSpace(*it)) { +			return it - text.begin(); +		} +	} + +	if (!dbcsCodePage || dbcsCodePage == CpUtf8) { +		// backward iterate for UTF-8 and single byte encoding to find word and punctuation boundary. +		std::string_view::iterator it = text.end() - 1; +		const bool punctuation = IsPunctuation(*it); +		do { +			--it; +			if (punctuation != IsPunctuation(*it)) { +				return it - text.begin() + 1;  			} -			if (ch < 'A') { -				lastPunctuationBreak = j; +		} while (it != text.begin()); + +		it = text.end() - 1; +		if (dbcsCodePage) { +			// for UTF-8 go back the start of last character. +			for (int trail = 0; trail < UTF8MaxBytes - 1 && UTF8IsTrailByte(*it); trail++) { +				--it;  			}  		} -		lastEncodingAllowedBreak = j; +		return it - text.begin(); +	} -		if (dbcsCodePage == CpUtf8) { -			j += UTF8BytesOfLead[ch]; -		} else if (dbcsCodePage) { -			j += IsDBCSLeadByteNoExcept(ch) ? 2 : 1; -		} else { -			j++; +	{ +		// forward iterate for DBCS to find word and punctuation boundary. +		size_t lastPunctuationBreak = 0; +		size_t lastEncodingAllowedBreak = 0; +		CharacterClass ccPrev = CharacterClass::space; +		for (size_t j = 0; j < text.length();) { +			const unsigned char ch = text[j]; +			lastEncodingAllowedBreak = j++; + +			CharacterClass cc = CharacterClass::word; +			if (UTF8IsAscii(ch)) { +				if (IsPunctuation(ch)) { +					cc = CharacterClass::punctuation; +				} +			} else { +				j += IsDBCSLeadByteNoExcept(ch); +			} +			if (cc != ccPrev) { +				ccPrev = cc; +				lastPunctuationBreak = lastEncodingAllowedBreak; +			}  		} +		return lastPunctuationBreak ? lastPunctuationBreak : lastEncodingAllowedBreak;  	} -	if (lastSpaceBreak >= 0) { -		return lastSpaceBreak; -	} else if (lastPunctuationBreak >= 0) { -		return lastPunctuationBreak; -	} -	return lastEncodingAllowedBreak;  }  EncodingFamily Document::CodePageFamily() const noexcept { diff --git a/src/Document.h b/src/Document.h index 897a1270c..e406118a7 100644 --- a/src/Document.h +++ b/src/Document.h @@ -352,7 +352,7 @@ public:  	bool IsDBCSTrailByteNoExcept(char ch) const noexcept;  	int DBCSDrawBytes(std::string_view text) const noexcept;  	bool IsDBCSDualByteAt(Sci::Position pos) const noexcept; -	int SafeSegment(const char *text, int lengthSegment) const noexcept; +	size_t SafeSegment(std::string_view text) const noexcept;  	EncodingFamily CodePageFamily() const noexcept;  	// Gateways to modifying document diff --git a/src/PositionCache.cxx b/src/PositionCache.cxx index 6370edb33..c9f4e8793 100644 --- a/src/PositionCache.cxx +++ b/src/PositionCache.cxx @@ -755,21 +755,20 @@ TextSegment BreakFinder::Next() {  		}  		subBreak = prev;  	} +  	// Splitting up a long run from prev to nextBreak in lots of approximately lengthEachSubdivision. -	// For very long runs add extra breaks after spaces or if no spaces before low punctuation.  	const int startSegment = subBreak; -	if ((nextBreak - subBreak) <= lengthEachSubdivision) { -		subBreak = -1; -		return TextSegment(startSegment, nextBreak - startSegment); +	const int remaining = nextBreak - startSegment; +	int lengthSegment = remaining; +	if (lengthSegment > lengthEachSubdivision) { +		lengthSegment = static_cast<int>(pdoc->SafeSegment(std::string_view(&ll->chars[startSegment], lengthEachSubdivision))); +	} +	if (lengthSegment < remaining) { +		subBreak += lengthSegment;  	} else { -		subBreak += pdoc->SafeSegment(&ll->chars[subBreak], lengthEachSubdivision); -		if (subBreak >= nextBreak) { -			subBreak = -1; -			return TextSegment(startSegment, nextBreak - startSegment); -		} else { -			return TextSegment(startSegment, subBreak - startSegment); -		} +		subBreak = -1;  	} +	return TextSegment(startSegment, lengthSegment);  }  bool BreakFinder::More() const noexcept { diff --git a/test/unit/testDocument.cxx b/test/unit/testDocument.cxx index 41e3907ae..4a7e20095 100644 --- a/test/unit/testDocument.cxx +++ b/test/unit/testDocument.cxx @@ -486,3 +486,120 @@ TEST_CASE("Words") {  		REQUIRE(!docEndSpace.document.IsWordAt(3, 5));  	}  } + +TEST_CASE("SafeSegment") { +	SECTION("Short") { +		const DocPlus doc("", 0); +		// all encoding: break before or after last space +		const std::string_view text = "12 "; +		size_t length = doc.document.SafeSegment(text); +		REQUIRE(length <= text.length()); +		REQUIRE(text[length - 1] == '2'); +		REQUIRE(text[length] == ' '); +	} + +	SECTION("ASCII") { +		const DocPlus doc("", 0); +		// all encoding: break before or after last space +		std::string_view text = "12 3 \t45"; +		size_t length = doc.document.SafeSegment(text); +		REQUIRE(text[length - 1] == ' '); +		REQUIRE(text[length] == '\t'); + +		// UTF-8 and ASCII: word and punctuation boundary in middle of text +		text = "(IsBreakSpace(text[j]))"; +		length = doc.document.SafeSegment(text); +		REQUIRE(text[length - 1] == 'j'); +		REQUIRE(text[length] == ']'); + +		// UTF-8 and ASCII: word and punctuation boundary near start of text +		text = "(IsBreakSpace"; +		length = doc.document.SafeSegment(text); +		REQUIRE(text[length - 1] == '('); +		REQUIRE(text[length] == 'I'); + +		// UTF-8 and ASCII: word and punctuation boundary near end of text +		text = "IsBreakSpace)"; +		length = doc.document.SafeSegment(text); +		REQUIRE(text[length - 1] == 'e'); +		REQUIRE(text[length] == ')'); + +		// break before last character +		text = "JapaneseJa"; +		length = doc.document.SafeSegment(text); +		REQUIRE(text[length - 1] == 'J'); +		REQUIRE(text[length] == 'a'); +	} + +	SECTION("UTF-8") { +		const DocPlus doc("", CpUtf8); +		// break before last character: no trail byte +		std::string_view text = "JapaneseJa"; +		size_t length = doc.document.SafeSegment(text); +		REQUIRE(text[length - 1] == 'J'); +		REQUIRE(text[length] == 'a'); + +		// break before last character: 1 trail byte +		text = "Japanese\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e\xc2\xa9"; +		length = doc.document.SafeSegment(text); +		REQUIRE(text[length - 1] == '\x9e'); +		REQUIRE(text[length] == '\xc2'); + +		// break before last character: 2 trail bytes +		text = "Japanese\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e"; +		length = doc.document.SafeSegment(text); +		REQUIRE(text[length - 1] == '\xac'); +		REQUIRE(text[length] == '\xe8'); + +		// break before last character: 3 trail bytes +		text = "Japanese\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e\xf0\x9f\x98\x8a"; +		length = doc.document.SafeSegment(text); +		REQUIRE(text[length - 1] == '\x9e'); +		REQUIRE(text[length] == '\xf0'); +	} + +	SECTION("DBCS Shift-JIS") { +		const DocPlus doc("", 932); +		// word and punctuation boundary in middle of text: single byte +		std::string_view text = "(IsBreakSpace(text[j]))"; +		size_t length = doc.document.SafeSegment(text); +		REQUIRE(text[length - 1] == 'j'); +		REQUIRE(text[length] == ']'); + +		// word and punctuation boundary in middle of text: double byte +		text = "(IsBreakSpace(text[\x8c\xea]))"; +		length = doc.document.SafeSegment(text); +		REQUIRE(text[length - 1] == '\xea'); +		REQUIRE(text[length] == ']'); + +		// word and punctuation boundary near start of text +		text = "(IsBreakSpace"; +		length = doc.document.SafeSegment(text); +		REQUIRE(text[length - 1] == '('); +		REQUIRE(text[length] == 'I'); + +		// word and punctuation boundary near end of text: single byte +		text = "IsBreakSpace)"; +		length = doc.document.SafeSegment(text); +		REQUIRE(text[length - 1] == 'e'); +		REQUIRE(text[length] == ')'); + +		// word and punctuation boundary near end of text: double byte +		text = "IsBreakSpace\x8c\xea)"; +		length = doc.document.SafeSegment(text); +		REQUIRE(text[length - 1] == '\xea'); +		REQUIRE(text[length] == ')'); + +		// break before last character: single byte +		text = "JapaneseJa"; +		length = doc.document.SafeSegment(text); +		REQUIRE(text[length - 1] == 'J'); +		REQUIRE(text[length] == 'a'); + +		// break before last character: double byte +		text = "Japanese\x93\xfa\x96\x7b\x8c\xea"; +		length = doc.document.SafeSegment(text); +		REQUIRE(text[length - 1] == '\x7b'); +		REQUIRE(text[length] == '\x8c'); +	} +} | 
