diff options
| author | Neil <nyamatongwe@gmail.com> | 2016-10-06 15:16:50 +1100 | 
|---|---|---|
| committer | Neil <nyamatongwe@gmail.com> | 2016-10-06 15:16:50 +1100 | 
| commit | 9c0c31495c68c2757bbf95aa3f114d865dff88b8 (patch) | |
| tree | 285098b5edde401cb67d2dea8dad90da2c8e013d | |
| parent | 1967c348184a35007b7fce5da81d7874a51edc3e (diff) | |
| download | scintilla-mirror-9c0c31495c68c2757bbf95aa3f114d865dff88b8.tar.gz | |
Word selection, navigation, and manipulation are now performed on characters
instead of bytes, leading to more natural behaviour for multi-byte encodings like
UTF-8.
| -rw-r--r-- | doc/ScintillaDoc.html | 13 | ||||
| -rw-r--r-- | doc/ScintillaHistory.html | 7 | ||||
| -rw-r--r-- | src/Document.cxx | 462 | ||||
| -rw-r--r-- | src/Document.h | 36 | ||||
| -rw-r--r-- | src/EditView.cxx | 25 | 
5 files changed, 396 insertions, 147 deletions
| diff --git a/doc/ScintillaDoc.html b/doc/ScintillaDoc.html index 3b2c480d0..adf3b9907 100644 --- a/doc/ScintillaDoc.html +++ b/doc/ScintillaDoc.html @@ -2322,8 +2322,13 @@ struct Sci_TextToFind {      Line ends are not selected by double clicking but do act as word separators.      </p> -    <p>Words are defined in terms of bytes, not characters so there are some issues with -    UTF-8 and DCBS documents.</p> +    <p>Words are defined in terms of characters and the sets of characters in each category can be customized to an extent. +    The NUL character (0) is always a space as the APIs to set categories use NUL-terminated strings. +    For single-byte encodings a category may be assigned to any character (1 to 0xFF). +    For multi-byte encodings a category may be assigned to characters from 1 to 0x7F with static behaviour from 0x80. +    For UTF-8, characters from 0x80 will use a category based on their Unicode general category. +    For Asian encodings, code pages 932, 936, 949, 950, and 1361, characters from 0x80 are treated as word characters. +    </p>      <p>Identifiers in programming languages are often sequences of words with capitalisation       (aCamelCaseIdentifier) or underscores (an_under_bar_ident) used to mark word boundaries. @@ -2437,7 +2442,7 @@ struct Sci_TextToFind {      </table>      <p><b id="SCI_SETWORDCHARS">SCI_SETWORDCHARS(<unused>, const char *characters)</b><br /> -     This message defines which characters (bytes) are members of the word category. +     This message defines which characters are members of the word category.       The character categories are set to default values before processing this function.      
For example, if you don't allow '_' in your set of characters      use:<br /> @@ -2449,6 +2454,8 @@ struct Sci_TextToFind {       If the characters parameter is 0 then the length that should be allocated       to store the entire set is returned.</p> +    <p>For multi-byte encodings, this API will not return meaningful values for 0x80 and above.</p> +      <p><b id="SCI_SETWHITESPACECHARS">SCI_SETWHITESPACECHARS(<unused>, const char *characters)</b><br />      <b id="SCI_GETWHITESPACECHARS">SCI_GETWHITESPACECHARS(<unused>, char *characters) → int</b><br />       Similar to <code>SCI_SETWORDCHARS</code>, this message allows the user to define which chars Scintilla considers diff --git a/doc/ScintillaHistory.html b/doc/ScintillaHistory.html index fb004776b..a134035dc 100644 --- a/doc/ScintillaHistory.html +++ b/doc/ScintillaHistory.html @@ -523,6 +523,13 @@  	Released 4 September 2016.  	</li>  	<li> +	Word selection, navigation, and manipulation is now performed on characters instead of bytes +	leading to more natural behaviour for multi-byte encodings like UTF-8. +	For UTF-8 characters 0x80 and above, classification into word; punctuation; space; or line-end +	is based on the Unicode general category of the character and is not customizable. +	<a href="http://sourceforge.net/p/scintilla/bugs/1832/">Bug #1832</a>. +	</li> +	<li>  	Two enums changed in Scintilla.iface which may lead to changed bindings.  	There were 2 FontQuality enums and the first is now PhasesDraw.  	
The prefix for FoldAction was SC_FOLDACTION and is now SC_FOLDACTION_ diff --git a/src/Document.cxx b/src/Document.cxx index f10e40aad..58f663376 100644 --- a/src/Document.cxx +++ b/src/Document.cxx @@ -26,6 +26,7 @@  #include "Scintilla.h"  #include "CharacterSet.h" +#include "CharacterCategory.h"  #include "Position.h"  #include "SplitVector.h"  #include "Partitioning.h" @@ -44,10 +45,6 @@  using namespace Scintilla;  #endif -static inline bool IsPunctuation(char ch) { -	return IsASCII(ch) && ispunct(ch); -} -  void LexInterface::Colourise(int start, int end) {  	if (pdoc && instance && !performingStyle) {  		// Protect against reentrance, which may occur, for example, when @@ -771,6 +768,77 @@ bool Document::NextCharacter(int &pos, int moveDir) const {  	}  } +Document::CharacterExtracted Document::CharacterAfter(int position) const { +	if (position >= Length()) { +		return CharacterExtracted(unicodeReplacementChar, 0); +	} +	const unsigned char leadByte = static_cast<unsigned char>(cb.CharAt(position)); +	if (!dbcsCodePage || UTF8IsAscii(leadByte)) { +		// Common case: ASCII character +		return CharacterExtracted(leadByte, 1); +	} +	if (SC_CP_UTF8 == dbcsCodePage) { +		const int widthCharBytes = UTF8BytesOfLead[leadByte]; +		unsigned char charBytes[UTF8MaxBytes] = { leadByte, 0, 0, 0 }; +		for (int b = 1; b<widthCharBytes; b++) +			charBytes[b] = static_cast<unsigned char>(cb.CharAt(position + b)); +		int utf8status = UTF8Classify(charBytes, widthCharBytes); +		if (utf8status & UTF8MaskInvalid) { +			// Treat as invalid and use up just one byte +			return CharacterExtracted(unicodeReplacementChar, 1); +		} else { +			return CharacterExtracted(UnicodeFromUTF8(charBytes), utf8status & UTF8MaskWidth); +		} +	} else { +		if (IsDBCSLeadByte(leadByte) && ((position + 1) < Length())) { +			return CharacterExtracted::DBCS(leadByte, static_cast<unsigned char>(cb.CharAt(position + 1))); +		} else { +			return CharacterExtracted(leadByte, 1); +		} +	} +} + 
+Document::CharacterExtracted Document::CharacterBefore(int position) const { +	if (position <= 0) { +		return CharacterExtracted(unicodeReplacementChar, 0); +	} +	const unsigned char previousByte = static_cast<unsigned char>(cb.CharAt(position - 1)); +	if (0 == dbcsCodePage) { +		return CharacterExtracted(previousByte, 1); +	} +	if (SC_CP_UTF8 == dbcsCodePage) { +		if (UTF8IsAscii(previousByte)) { +			return CharacterExtracted(previousByte, 1); +		} +		position--; +		// If previousByte is not a trail byte then its invalid +		if (UTF8IsTrailByte(previousByte)) { +			// If previousByte is a trail byte in a valid UTF-8 character then find start of character +			int startUTF = position; +			int endUTF = position; +			if (InGoodUTF8(position, startUTF, endUTF)) { +				const int widthCharBytes = endUTF - startUTF; +				unsigned char charBytes[UTF8MaxBytes] = { 0, 0, 0, 0 }; +				for (int b = 0; b<widthCharBytes; b++) +					charBytes[b] = static_cast<unsigned char>(cb.CharAt(startUTF + b)); +				int utf8status = UTF8Classify(charBytes, widthCharBytes); +				if (utf8status & UTF8MaskInvalid) { +					// Treat as invalid and use up just one byte +					return CharacterExtracted(unicodeReplacementChar, 1); +				} else { +					return CharacterExtracted(UnicodeFromUTF8(charBytes), utf8status & UTF8MaskWidth); +				} +			} +			// Else invalid UTF-8 so return position of isolated trail byte +		} +		return CharacterExtracted(unicodeReplacementChar, 1); +	} else { +		// Moving backwards in DBCS is complex so use NextPosition +		const int posStartCharacter = NextPosition(position, -1); +		return CharacterAfter(posStartCharacter); +	} +} +  // Return -1  on out-of-bounds  Sci_Position SCI_METHOD Document::GetRelativePosition(Sci_Position positionStart, Sci_Position characterOffset) const {  	int pos = positionStart; @@ -1485,28 +1553,104 @@ int Document::ParaDown(int pos) const {  		return LineEnd(line-1);  } -CharClassify::cc Document::WordCharClass(unsigned char ch) const { -	if 
((SC_CP_UTF8 == dbcsCodePage) && (!UTF8IsAscii(ch))) -		return CharClassify::ccWord; -	return charClass.GetClass(ch); +bool Document::IsASCIIWordByte(unsigned char ch) const { +	if (IsASCII(ch)) { +		return charClass.GetClass(ch) == CharClassify::ccWord; +	} else { +		return false; +	} +} + +CharClassify::cc Document::WordCharacterClass(unsigned int ch) const { +	if (dbcsCodePage && (!UTF8IsAscii(ch))) { +		if (SC_CP_UTF8 == dbcsCodePage) { +			// Use hard coded Unicode class +			const CharacterCategory cc = CategoriseCharacter(ch); +			switch (cc) { + +				// Separator, Line/Paragraph +			case ccZl: +			case ccZp: +				return CharClassify::ccNewLine; + +				// Separator, Space +			case ccZs: +				// Other +			case ccCc: +			case ccCf: +			case ccCs: +			case ccCo: +			case ccCn: +				return CharClassify::ccSpace; + +				// Letter +			case ccLu: +			case ccLl: +			case ccLt: +			case ccLm: +			case ccLo: +				// Number +			case ccNd: +			case ccNl: +			case ccNo: +				// Mark - includes combining diacritics +			case ccMn: +			case ccMc: +			case ccMe: +				return CharClassify::ccWord; + +				// Punctuation +			case ccPc: +			case ccPd: +			case ccPs: +			case ccPe: +			case ccPi: +			case ccPf: +			case ccPo: +				// Symbol +			case ccSm: +			case ccSc: +			case ccSk: +			case ccSo: +				return CharClassify::ccPunctuation; + +			} +		} else { +			// Asian DBCS +			return CharClassify::ccWord; +		} +	} +	return charClass.GetClass(static_cast<unsigned char>(ch));  }  /**   * Used by commmands that want to select whole words.   * Finds the start of word at pos when delta < 0 or the end of the word when delta >= 0.   
*/ -int Document::ExtendWordSelect(int pos, int delta, bool onlyWordCharacters) { +int Document::ExtendWordSelect(int pos, int delta, bool onlyWordCharacters) const {  	CharClassify::cc ccStart = CharClassify::ccWord;  	if (delta < 0) { -		if (!onlyWordCharacters) -			ccStart = WordCharClass(cb.CharAt(pos-1)); -		while (pos > 0 && (WordCharClass(cb.CharAt(pos - 1)) == ccStart)) -			pos--; +		if (!onlyWordCharacters) { +			const CharacterExtracted ce = CharacterBefore(pos); +			ccStart = WordCharacterClass(ce.character); +		} +		while (pos > 0) { +			const CharacterExtracted ce = CharacterBefore(pos); +			if (WordCharacterClass(ce.character) != ccStart) +				break; +			pos -= ce.widthBytes; +		}  	} else { -		if (!onlyWordCharacters && pos < Length()) -			ccStart = WordCharClass(cb.CharAt(pos)); -		while (pos < (Length()) && (WordCharClass(cb.CharAt(pos)) == ccStart)) -			pos++; +		if (!onlyWordCharacters && pos < Length()) { +			const CharacterExtracted ce = CharacterAfter(pos); +			ccStart = WordCharacterClass(ce.character); +		} +		while (pos < Length()) { +			const CharacterExtracted ce = CharacterAfter(pos); +			if (WordCharacterClass(ce.character) != ccStart) +				break; +			pos += ce.widthBytes; +		}  	}  	return MovePositionOutsideChar(pos, delta, true);  } @@ -1518,22 +1662,39 @@ int Document::ExtendWordSelect(int pos, int delta, bool onlyWordCharacters) {   * additional movement to transit white space.   * Used by cursor movement by word commands.   
*/ -int Document::NextWordStart(int pos, int delta) { +int Document::NextWordStart(int pos, int delta) const {  	if (delta < 0) { -		while (pos > 0 && (WordCharClass(cb.CharAt(pos - 1)) == CharClassify::ccSpace)) -			pos--; +		while (pos > 0) { +			const CharacterExtracted ce = CharacterBefore(pos); +			if (WordCharacterClass(ce.character) != CharClassify::ccSpace) +				break; +			pos -= ce.widthBytes; +		}  		if (pos > 0) { -			CharClassify::cc ccStart = WordCharClass(cb.CharAt(pos-1)); -			while (pos > 0 && (WordCharClass(cb.CharAt(pos - 1)) == ccStart)) { -				pos--; +			CharacterExtracted ce = CharacterBefore(pos); +			const CharClassify::cc ccStart = WordCharacterClass(ce.character); +			while (pos > 0) { +				ce = CharacterBefore(pos); +				if (WordCharacterClass(ce.character) != ccStart) +					break; +				pos -= ce.widthBytes;  			}  		}  	} else { -		CharClassify::cc ccStart = WordCharClass(cb.CharAt(pos)); -		while (pos < (Length()) && (WordCharClass(cb.CharAt(pos)) == ccStart)) -			pos++; -		while (pos < (Length()) && (WordCharClass(cb.CharAt(pos)) == CharClassify::ccSpace)) -			pos++; +		CharacterExtracted ce = CharacterAfter(pos); +		const CharClassify::cc ccStart = WordCharacterClass(ce.character); +		while (pos < Length()) { +			ce = CharacterAfter(pos); +			if (WordCharacterClass(ce.character) != ccStart) +				break; +			pos += ce.widthBytes; +		} +		while (pos < Length()) { +			ce = CharacterAfter(pos); +			if (WordCharacterClass(ce.character) != CharClassify::ccSpace) +				break; +			pos += ce.widthBytes; +		}  	}  	return pos;  } @@ -1545,27 +1706,41 @@ int Document::NextWordStart(int pos, int delta) {   * additional movement to transit white space.   * Used by cursor movement by word commands.   
*/ -int Document::NextWordEnd(int pos, int delta) { +int Document::NextWordEnd(int pos, int delta) const {  	if (delta < 0) {  		if (pos > 0) { -			CharClassify::cc ccStart = WordCharClass(cb.CharAt(pos-1)); +			CharacterExtracted ce = CharacterBefore(pos); +			CharClassify::cc ccStart = WordCharacterClass(ce.character);  			if (ccStart != CharClassify::ccSpace) { -				while (pos > 0 && WordCharClass(cb.CharAt(pos - 1)) == ccStart) { -					pos--; +				while (pos > 0) { +					ce = CharacterBefore(pos); +					if (WordCharacterClass(ce.character) != ccStart) +						break; +					pos -= ce.widthBytes;  				}  			} -			while (pos > 0 && WordCharClass(cb.CharAt(pos - 1)) == CharClassify::ccSpace) { -				pos--; +			while (pos > 0) { +				ce = CharacterBefore(pos); +				if (WordCharacterClass(ce.character) != CharClassify::ccSpace) +					break; +				pos -= ce.widthBytes;  			}  		}  	} else { -		while (pos < Length() && WordCharClass(cb.CharAt(pos)) == CharClassify::ccSpace) { -			pos++; +		while (pos < Length()) { +			CharacterExtracted ce = CharacterAfter(pos); +			if (WordCharacterClass(ce.character) != CharClassify::ccSpace) +				break; +			pos += ce.widthBytes;  		}  		if (pos < Length()) { -			CharClassify::cc ccStart = WordCharClass(cb.CharAt(pos)); -			while (pos < Length() && WordCharClass(cb.CharAt(pos)) == ccStart) { -				pos++; +			CharacterExtracted ce = CharacterAfter(pos); +			CharClassify::cc ccStart = WordCharacterClass(ce.character); +			while (pos < Length()) { +				ce = CharacterAfter(pos); +				if (WordCharacterClass(ce.character) != ccStart) +					break; +				pos += ce.widthBytes;  			}  		}  	} @@ -1577,10 +1752,15 @@ int Document::NextWordEnd(int pos, int delta) {   * the previous character is of a different character class.   
*/  bool Document::IsWordStartAt(int pos) const { +	if (pos >= Length()) +		return false;  	if (pos > 0) { -		CharClassify::cc ccPos = WordCharClass(CharAt(pos)); +		const CharacterExtracted cePos = CharacterAfter(pos); +		const CharClassify::cc ccPos = WordCharacterClass(cePos.character); +		const CharacterExtracted cePrev = CharacterBefore(pos); +		const CharClassify::cc ccPrev = WordCharacterClass(cePrev.character);  		return (ccPos == CharClassify::ccWord || ccPos == CharClassify::ccPunctuation) && -			(ccPos != WordCharClass(CharAt(pos - 1))); +			(ccPos != ccPrev);  	}  	return true;  } @@ -1590,10 +1770,15 @@ bool Document::IsWordStartAt(int pos) const {   * the next character is of a different character class.   */  bool Document::IsWordEndAt(int pos) const { +	if (pos <= 0) +		return false;  	if (pos < Length()) { -		CharClassify::cc ccPrev = WordCharClass(CharAt(pos-1)); +		const CharacterExtracted cePos = CharacterAfter(pos); +		const CharClassify::cc ccPos = WordCharacterClass(cePos.character); +		const CharacterExtracted cePrev = CharacterBefore(pos); +		const CharClassify::cc ccPrev = WordCharacterClass(cePrev.character);  		return (ccPrev == CharClassify::ccWord || ccPrev == CharClassify::ccPunctuation) && -			(ccPrev != WordCharClass(CharAt(pos))); +			(ccPrev != ccPos);  	}  	return true;  } @@ -2075,96 +2260,137 @@ void Document::NotifyModified(DocModification mh) {  	}  } -bool Document::IsWordPartSeparator(char ch) const { -	return (WordCharClass(ch) == CharClassify::ccWord) && IsPunctuation(ch); +// Used for word part navigation. 
+static bool IsASCIIPunctuationCharacter(unsigned int ch) { +	switch (ch) { +	case '!': +	case '"': +	case '#': +	case '$': +	case '%': +	case '&': +	case '\'': +	case '(': +	case ')': +	case '*': +	case '+': +	case ',': +	case '-': +	case '.': +	case '/': +	case ':': +	case ';': +	case '<': +	case '=': +	case '>': +	case '?': +	case '@': +	case '[': +	case '\\': +	case ']': +	case '^': +	case '_': +	case '`': +	case '{': +	case '|': +	case '}': +	case '~': +		return true; +	default: +		return false; +	} +} + +bool Document::IsWordPartSeparator(unsigned int ch) const { +	return (WordCharacterClass(ch) == CharClassify::ccWord) && IsASCIIPunctuationCharacter(ch);  } -int Document::WordPartLeft(int pos) { +int Document::WordPartLeft(int pos) const {  	if (pos > 0) { -		--pos; -		char startChar = cb.CharAt(pos); -		if (IsWordPartSeparator(startChar)) { -			while (pos > 0 && IsWordPartSeparator(cb.CharAt(pos))) { -				--pos; +		pos -= CharacterBefore(pos).widthBytes; +		CharacterExtracted ceStart = CharacterAfter(pos); +		if (IsWordPartSeparator(ceStart.character)) { +			while (pos > 0 && IsWordPartSeparator(CharacterAfter(pos).character)) { +				pos -= CharacterBefore(pos).widthBytes;  			}  		}  		if (pos > 0) { -			startChar = cb.CharAt(pos); -			--pos; -			if (IsLowerCase(startChar)) { -				while (pos > 0 && IsLowerCase(cb.CharAt(pos))) -					--pos; -				if (!IsUpperCase(cb.CharAt(pos)) && !IsLowerCase(cb.CharAt(pos))) -					++pos; -			} else if (IsUpperCase(startChar)) { -				while (pos > 0 && IsUpperCase(cb.CharAt(pos))) -					--pos; -				if (!IsUpperCase(cb.CharAt(pos))) -					++pos; -			} else if (IsADigit(startChar)) { -				while (pos > 0 && IsADigit(cb.CharAt(pos))) -					--pos; -				if (!IsADigit(cb.CharAt(pos))) -					++pos; -			} else if (IsPunctuation(startChar)) { -				while (pos > 0 && IsPunctuation(cb.CharAt(pos))) -					--pos; -				if (!IsPunctuation(cb.CharAt(pos))) -					++pos; -			} else if (isspacechar(startChar)) { -				while (pos > 0 && 
isspacechar(cb.CharAt(pos))) -					--pos; -				if (!isspacechar(cb.CharAt(pos))) -					++pos; -			} else if (!IsASCII(startChar)) { -				while (pos > 0 && !IsASCII(cb.CharAt(pos))) -					--pos; -				if (IsASCII(cb.CharAt(pos))) -					++pos; +			ceStart = CharacterAfter(pos); +			pos -= CharacterBefore(pos).widthBytes; +			if (IsLowerCase(ceStart.character)) { +				while (pos > 0 && IsLowerCase(CharacterAfter(pos).character)) +					pos -= CharacterBefore(pos).widthBytes; +				if (!IsUpperCase(CharacterAfter(pos).character) && !IsLowerCase(CharacterAfter(pos).character)) +					pos += CharacterAfter(pos).widthBytes; +			} else if (IsUpperCase(ceStart.character)) { +				while (pos > 0 && IsUpperCase(CharacterAfter(pos).character)) +					pos -= CharacterBefore(pos).widthBytes; +				if (!IsUpperCase(CharacterAfter(pos).character)) +					pos += CharacterAfter(pos).widthBytes; +			} else if (IsADigit(ceStart.character)) { +				while (pos > 0 && IsADigit(CharacterAfter(pos).character)) +					pos -= CharacterBefore(pos).widthBytes; +				if (!IsADigit(CharacterAfter(pos).character)) +					pos += CharacterAfter(pos).widthBytes; +			} else if (IsASCIIPunctuationCharacter(ceStart.character)) { +				while (pos > 0 && IsASCIIPunctuationCharacter(CharacterAfter(pos).character)) +					pos -= CharacterBefore(pos).widthBytes; +				if (!IsASCIIPunctuationCharacter(CharacterAfter(pos).character)) +					pos += CharacterAfter(pos).widthBytes; +			} else if (isspacechar(ceStart.character)) { +				while (pos > 0 && isspacechar(CharacterAfter(pos).character)) +					pos -= CharacterBefore(pos).widthBytes; +				if (!isspacechar(CharacterAfter(pos).character)) +					pos += CharacterAfter(pos).widthBytes; +			} else if (!IsASCII(ceStart.character)) { +				while (pos > 0 && !IsASCII(CharacterAfter(pos).character)) +					pos -= CharacterBefore(pos).widthBytes; +				if (IsASCII(CharacterAfter(pos).character)) +					pos += CharacterAfter(pos).widthBytes;  			} else { -				++pos; +				pos += 
CharacterAfter(pos).widthBytes;  			}  		}  	}  	return pos;  } -int Document::WordPartRight(int pos) { -	char startChar = cb.CharAt(pos); -	int length = Length(); -	if (IsWordPartSeparator(startChar)) { -		while (pos < length && IsWordPartSeparator(cb.CharAt(pos))) -			++pos; -		startChar = cb.CharAt(pos); -	} -	if (!IsASCII(startChar)) { -		while (pos < length && !IsASCII(cb.CharAt(pos))) -			++pos; -	} else if (IsLowerCase(startChar)) { -		while (pos < length && IsLowerCase(cb.CharAt(pos))) -			++pos; -	} else if (IsUpperCase(startChar)) { -		if (IsLowerCase(cb.CharAt(pos + 1))) { -			++pos; -			while (pos < length && IsLowerCase(cb.CharAt(pos))) -				++pos; +int Document::WordPartRight(int pos) const { +	CharacterExtracted ceStart = CharacterAfter(pos); +	const int length = Length(); +	if (IsWordPartSeparator(ceStart.character)) { +		while (pos < length && IsWordPartSeparator(CharacterAfter(pos).character)) +			pos += CharacterAfter(pos).widthBytes; +		ceStart = CharacterAfter(pos); +	} +	if (!IsASCII(ceStart.character)) { +		while (pos < length && !IsASCII(CharacterAfter(pos).character)) +			pos += CharacterAfter(pos).widthBytes; +	} else if (IsLowerCase(ceStart.character)) { +		while (pos < length && IsLowerCase(CharacterAfter(pos).character)) +			pos += CharacterAfter(pos).widthBytes; +	} else if (IsUpperCase(ceStart.character)) { +		if (IsLowerCase(CharacterAfter(pos + ceStart.widthBytes).character)) { +			pos += CharacterAfter(pos).widthBytes; +			while (pos < length && IsLowerCase(CharacterAfter(pos).character)) +				pos += CharacterAfter(pos).widthBytes;  		} else { -			while (pos < length && IsUpperCase(cb.CharAt(pos))) -				++pos; -		} -		if (IsLowerCase(cb.CharAt(pos)) && IsUpperCase(cb.CharAt(pos - 1))) -			--pos; -	} else if (IsADigit(startChar)) { -		while (pos < length && IsADigit(cb.CharAt(pos))) -			++pos; -	} else if (IsPunctuation(startChar)) { -		while (pos < length && IsPunctuation(cb.CharAt(pos))) -			++pos; -	} else if 
(isspacechar(startChar)) { -		while (pos < length && isspacechar(cb.CharAt(pos))) -			++pos; +			while (pos < length && IsUpperCase(CharacterAfter(pos).character)) +				pos += CharacterAfter(pos).widthBytes; +		} +		if (IsLowerCase(CharacterAfter(pos).character) && IsUpperCase(CharacterBefore(pos).character)) +			pos -= CharacterBefore(pos).widthBytes; +	} else if (IsADigit(ceStart.character)) { +		while (pos < length && IsADigit(CharacterAfter(pos).character)) +			pos += CharacterAfter(pos).widthBytes; +	} else if (IsASCIIPunctuationCharacter(ceStart.character)) { +		while (pos < length && IsASCIIPunctuationCharacter(CharacterAfter(pos).character)) +			pos += CharacterAfter(pos).widthBytes; +	} else if (isspacechar(ceStart.character)) { +		while (pos < length && isspacechar(CharacterAfter(pos).character)) +			pos += CharacterAfter(pos).widthBytes;  	} else { -		++pos; +		pos += CharacterAfter(pos).widthBytes;  	}  	return pos;  } diff --git a/src/Document.h b/src/Document.h index d31465f62..c0a0bb808 100644 --- a/src/Document.h +++ b/src/Document.h @@ -238,6 +238,18 @@ private:  public: +	struct CharacterExtracted { +		unsigned int character; +		unsigned int widthBytes; +		CharacterExtracted(unsigned int character_, unsigned int widthBytes_) : +			character(character_), widthBytes(widthBytes_) { +		} +		// For DBCS characters turn 2 bytes into an int +		static CharacterExtracted DBCS(unsigned char lead, unsigned char trail) { +			return CharacterExtracted((lead << 8) | trail, 2); +		} +	}; +  	LexInterface *pli;  	int eolMode; @@ -284,6 +296,8 @@ public:  	int MovePositionOutsideChar(int pos, int moveDir, bool checkLineEnd=true) const;  	int NextPosition(int pos, int moveDir) const;  	bool NextCharacter(int &pos, int moveDir) const;	// Returns true if pos changed +	Document::CharacterExtracted CharacterAfter(int position) const; +	Document::CharacterExtracted CharacterBefore(int position) const;  	Sci_Position SCI_METHOD GetRelativePosition(Sci_Position 
positionStart, Sci_Position characterOffset) const;  	int GetRelativePositionUTF16(int positionStart, int characterOffset) const;  	int SCI_METHOD GetCharacterAndWidth(Sci_Position position, Sci_Position *pWidth) const; @@ -373,19 +387,12 @@ public:  	void GetHighlightDelimiters(HighlightDelimiter &hDelimiter, int line, int lastLine);  	void Indent(bool forwards); -	int ExtendWordSelect(int pos, int delta, bool onlyWordCharacters=false); -	int NextWordStart(int pos, int delta); -	int NextWordEnd(int pos, int delta); +	int ExtendWordSelect(int pos, int delta, bool onlyWordCharacters=false) const; +	int NextWordStart(int pos, int delta) const; +	int NextWordEnd(int pos, int delta) const;  	Sci_Position SCI_METHOD Length() const { return cb.Length(); }  	void Allocate(int newSize) { cb.Allocate(newSize); } -	struct CharacterExtracted { -		unsigned int character; -		unsigned int widthBytes; -		CharacterExtracted(unsigned int character_, unsigned int widthBytes_) : -			character(character_), widthBytes(widthBytes_) { -		} -	};  	CharacterExtracted ExtractCharacter(int position) const;  	bool IsWordStartAt(int pos) const; @@ -437,10 +444,11 @@ public:  	bool AddWatcher(DocWatcher *watcher, void *userData);  	bool RemoveWatcher(DocWatcher *watcher, void *userData); -	CharClassify::cc WordCharClass(unsigned char ch) const; -	bool IsWordPartSeparator(char ch) const; -	int WordPartLeft(int pos); -	int WordPartRight(int pos); +	bool IsASCIIWordByte(unsigned char ch) const; +	CharClassify::cc WordCharacterClass(unsigned int ch) const; +	bool IsWordPartSeparator(unsigned int ch) const; +	int WordPartLeft(int pos) const; +	int WordPartRight(int pos) const;  	int ExtendStyleRange(int pos, int delta, bool singleLine = false);  	bool IsWhiteLine(int line) const;  	int ParaUp(int pos) const; diff --git a/src/EditView.cxx b/src/EditView.cxx index 92c341d8f..e6cd8fcfe 100644 --- a/src/EditView.cxx +++ b/src/EditView.cxx @@ -25,6 +25,7 @@  #include "Scintilla.h"  #include 
"StringCopy.h" +#include "CharacterSet.h"  #include "Position.h"  #include "SplitVector.h"  #include "Partitioning.h" @@ -389,16 +390,16 @@ void EditView::LayoutLine(const EditModel &model, int line, Surface *surface, co  					(ll->chars[numCharsInLine] == chDoc);  				else if (vstyle.styles[ll->styles[numCharsInLine]].caseForce == Style::caseLower)  					allSame = allSame && -					(ll->chars[numCharsInLine] == static_cast<char>(tolower(chDoc))); +					(ll->chars[numCharsInLine] == MakeLowerCase(chDoc));  				else if (vstyle.styles[ll->styles[numCharsInLine]].caseForce == Style::caseUpper)  					allSame = allSame && -					(ll->chars[numCharsInLine] == static_cast<char>(toupper(chDoc))); +					(ll->chars[numCharsInLine] == MakeUpperCase(chDoc));  				else	{ // Style::caseCamel -					if ((model.pdoc->WordCharClass(ll->chars[numCharsInLine]) == CharClassify::ccWord) && -					  ((numCharsInLine == 0) || (model.pdoc->WordCharClass(ll->chars[numCharsInLine - 1]) != CharClassify::ccWord))) { -						allSame = allSame && (ll->chars[numCharsInLine] == static_cast<char>(toupper(chDoc))); +					if ((model.pdoc->IsASCIIWordByte(ll->chars[numCharsInLine])) && +					  ((numCharsInLine == 0) || (!model.pdoc->IsASCIIWordByte(ll->chars[numCharsInLine - 1])))) { +						allSame = allSame && (ll->chars[numCharsInLine] == MakeUpperCase(chDoc));  					} else { -						allSame = allSame && (ll->chars[numCharsInLine] == static_cast<char>(tolower(chDoc))); +						allSame = allSame && (ll->chars[numCharsInLine] == MakeLowerCase(chDoc));  					}  				}  				numCharsInLine++; @@ -440,15 +441,15 @@ void EditView::LayoutLine(const EditModel &model, int line, Surface *surface, co  			for (int charInLine = 0; charInLine<lineLength; charInLine++) {  				char chDoc = ll->chars[charInLine];  				if (vstyle.styles[ll->styles[charInLine]].caseForce == Style::caseUpper) -					ll->chars[charInLine] = static_cast<char>(toupper(chDoc)); +					ll->chars[charInLine] = static_cast<char>(MakeUpperCase(chDoc));  
				else if (vstyle.styles[ll->styles[charInLine]].caseForce == Style::caseLower) -					ll->chars[charInLine] = static_cast<char>(tolower(chDoc)); +					ll->chars[charInLine] = static_cast<char>(MakeLowerCase(chDoc));  				else if (vstyle.styles[ll->styles[charInLine]].caseForce == Style::caseCamel) { -					if ((model.pdoc->WordCharClass(ll->chars[charInLine]) == CharClassify::ccWord) && -					  ((charInLine == 0) || (model.pdoc->WordCharClass(ll->chars[charInLine - 1]) != CharClassify::ccWord))) { -						ll->chars[charInLine] = static_cast<char>(toupper(chDoc)); +					if ((model.pdoc->IsASCIIWordByte(ll->chars[charInLine])) && +					  ((charInLine == 0) || (!model.pdoc->IsASCIIWordByte(ll->chars[charInLine - 1])))) { +						ll->chars[charInLine] = static_cast<char>(MakeUpperCase(chDoc));  					} else { -						ll->chars[charInLine] = static_cast<char>(tolower(chDoc)); +						ll->chars[charInLine] = static_cast<char>(MakeLowerCase(chDoc));  					}  				}  			} | 
