diff options
Diffstat (limited to 'src/Document.cxx')
| -rw-r--r-- | src/Document.cxx | 49 | 
1 files changed, 49 insertions, 0 deletions
| diff --git a/src/Document.cxx b/src/Document.cxx index 08bc24ecf..7b718f272 100644 --- a/src/Document.cxx +++ b/src/Document.cxx @@ -713,6 +713,55 @@ bool SCI_METHOD Document::IsDBCSLeadByte(char ch) const {  	return false;  } +inline bool IsSpaceOrTab(int ch) { +	return ch == ' ' || ch == '\t'; +} + +// Need to break text into segments near lengthSegment but taking into +// account the encoding to not break inside a UTF-8 or DBCS character +// and also trying to avoid breaking inside a pair of combining characters. +// The segment length must always be long enough (more than 4 bytes) +// so that there will be at least one whole character to make a segment. +// For UTF-8, text must consist only of valid whole characters. +// In preference order from best to worst: +//   1) Break after space +//   2) Break before punctuation +//   3) Break after whole character + +int Document::SafeSegment(const char *text, int length, int lengthSegment) { +	if (length <= lengthSegment) +		return length; +	int lastSpaceBreak = -1; +	int lastPunctuationBreak = -1; +	int lastEncodingAllowedBreak = -1; +	for (int j=0; j < lengthSegment;) { +		unsigned char ch = static_cast<unsigned char>(text[j]); +		if (j > 0) { +			if (IsSpaceOrTab(text[j - 1]) && !IsSpaceOrTab(text[j])) { +				lastSpaceBreak = j; +			} +			if (ch < 'A') { +				lastPunctuationBreak = j; +			} +		} +		lastEncodingAllowedBreak = j; + +		if (dbcsCodePage == SC_CP_UTF8) { +			j += (ch < 0x80) ? 1 : BytesFromLead(ch); +		} else if (dbcsCodePage) { +			j += IsDBCSLeadByte(ch) ? 2 : 1; +		} else { +			j++; +		} +	} +	if (lastSpaceBreak >= 0) { +		return lastSpaceBreak; +	} else if (lastPunctuationBreak >= 0) { +		return lastPunctuationBreak; +	} +	return lastEncodingAllowedBreak; +} +  void Document::ModifiedAt(int pos) {  	if (endStyled > pos)  		endStyled = pos; | 
