diff options
| author | Neil <nyamatongwe@gmail.com> | 2016-10-06 15:16:50 +1100 | 
|---|---|---|
| committer | Neil <nyamatongwe@gmail.com> | 2016-10-06 15:16:50 +1100 | 
| commit | 9c0c31495c68c2757bbf95aa3f114d865dff88b8 (patch) | |
| tree | 285098b5edde401cb67d2dea8dad90da2c8e013d /src/Document.cxx | |
| parent | 1967c348184a35007b7fce5da81d7874a51edc3e (diff) | |
| download | scintilla-mirror-9c0c31495c68c2757bbf95aa3f114d865dff88b8.tar.gz | |
Word selection, navigation, and manipulation are now performed on characters
instead of bytes, leading to more natural behaviour for multi-byte encodings like
UTF-8.
Diffstat (limited to 'src/Document.cxx')
| -rw-r--r-- | src/Document.cxx | 462 | 
1 files changed, 344 insertions, 118 deletions
| diff --git a/src/Document.cxx b/src/Document.cxx index f10e40aad..58f663376 100644 --- a/src/Document.cxx +++ b/src/Document.cxx @@ -26,6 +26,7 @@  #include "Scintilla.h"  #include "CharacterSet.h" +#include "CharacterCategory.h"  #include "Position.h"  #include "SplitVector.h"  #include "Partitioning.h" @@ -44,10 +45,6 @@  using namespace Scintilla;  #endif -static inline bool IsPunctuation(char ch) { -	return IsASCII(ch) && ispunct(ch); -} -  void LexInterface::Colourise(int start, int end) {  	if (pdoc && instance && !performingStyle) {  		// Protect against reentrance, which may occur, for example, when @@ -771,6 +768,77 @@ bool Document::NextCharacter(int &pos, int moveDir) const {  	}  } +Document::CharacterExtracted Document::CharacterAfter(int position) const { +	if (position >= Length()) { +		return CharacterExtracted(unicodeReplacementChar, 0); +	} +	const unsigned char leadByte = static_cast<unsigned char>(cb.CharAt(position)); +	if (!dbcsCodePage || UTF8IsAscii(leadByte)) { +		// Common case: ASCII character +		return CharacterExtracted(leadByte, 1); +	} +	if (SC_CP_UTF8 == dbcsCodePage) { +		const int widthCharBytes = UTF8BytesOfLead[leadByte]; +		unsigned char charBytes[UTF8MaxBytes] = { leadByte, 0, 0, 0 }; +		for (int b = 1; b<widthCharBytes; b++) +			charBytes[b] = static_cast<unsigned char>(cb.CharAt(position + b)); +		int utf8status = UTF8Classify(charBytes, widthCharBytes); +		if (utf8status & UTF8MaskInvalid) { +			// Treat as invalid and use up just one byte +			return CharacterExtracted(unicodeReplacementChar, 1); +		} else { +			return CharacterExtracted(UnicodeFromUTF8(charBytes), utf8status & UTF8MaskWidth); +		} +	} else { +		if (IsDBCSLeadByte(leadByte) && ((position + 1) < Length())) { +			return CharacterExtracted::DBCS(leadByte, static_cast<unsigned char>(cb.CharAt(position + 1))); +		} else { +			return CharacterExtracted(leadByte, 1); +		} +	} +} + +Document::CharacterExtracted Document::CharacterBefore(int position) const { +	if 
(position <= 0) { +		return CharacterExtracted(unicodeReplacementChar, 0); +	} +	const unsigned char previousByte = static_cast<unsigned char>(cb.CharAt(position - 1)); +	if (0 == dbcsCodePage) { +		return CharacterExtracted(previousByte, 1); +	} +	if (SC_CP_UTF8 == dbcsCodePage) { +		if (UTF8IsAscii(previousByte)) { +			return CharacterExtracted(previousByte, 1); +		} +		position--; +		// If previousByte is not a trail byte then its invalid +		if (UTF8IsTrailByte(previousByte)) { +			// If previousByte is a trail byte in a valid UTF-8 character then find start of character +			int startUTF = position; +			int endUTF = position; +			if (InGoodUTF8(position, startUTF, endUTF)) { +				const int widthCharBytes = endUTF - startUTF; +				unsigned char charBytes[UTF8MaxBytes] = { 0, 0, 0, 0 }; +				for (int b = 0; b<widthCharBytes; b++) +					charBytes[b] = static_cast<unsigned char>(cb.CharAt(startUTF + b)); +				int utf8status = UTF8Classify(charBytes, widthCharBytes); +				if (utf8status & UTF8MaskInvalid) { +					// Treat as invalid and use up just one byte +					return CharacterExtracted(unicodeReplacementChar, 1); +				} else { +					return CharacterExtracted(UnicodeFromUTF8(charBytes), utf8status & UTF8MaskWidth); +				} +			} +			// Else invalid UTF-8 so return position of isolated trail byte +		} +		return CharacterExtracted(unicodeReplacementChar, 1); +	} else { +		// Moving backwards in DBCS is complex so use NextPosition +		const int posStartCharacter = NextPosition(position, -1); +		return CharacterAfter(posStartCharacter); +	} +} +  // Return -1  on out-of-bounds  Sci_Position SCI_METHOD Document::GetRelativePosition(Sci_Position positionStart, Sci_Position characterOffset) const {  	int pos = positionStart; @@ -1485,28 +1553,104 @@ int Document::ParaDown(int pos) const {  		return LineEnd(line-1);  } -CharClassify::cc Document::WordCharClass(unsigned char ch) const { -	if ((SC_CP_UTF8 == dbcsCodePage) && (!UTF8IsAscii(ch))) -		return CharClassify::ccWord; 
-	return charClass.GetClass(ch); +bool Document::IsASCIIWordByte(unsigned char ch) const { +	if (IsASCII(ch)) { +		return charClass.GetClass(ch) == CharClassify::ccWord; +	} else { +		return false; +	} +} + +CharClassify::cc Document::WordCharacterClass(unsigned int ch) const { +	if (dbcsCodePage && (!UTF8IsAscii(ch))) { +		if (SC_CP_UTF8 == dbcsCodePage) { +			// Use hard coded Unicode class +			const CharacterCategory cc = CategoriseCharacter(ch); +			switch (cc) { + +				// Separator, Line/Paragraph +			case ccZl: +			case ccZp: +				return CharClassify::ccNewLine; + +				// Separator, Space +			case ccZs: +				// Other +			case ccCc: +			case ccCf: +			case ccCs: +			case ccCo: +			case ccCn: +				return CharClassify::ccSpace; + +				// Letter +			case ccLu: +			case ccLl: +			case ccLt: +			case ccLm: +			case ccLo: +				// Number +			case ccNd: +			case ccNl: +			case ccNo: +				// Mark - includes combining diacritics +			case ccMn: +			case ccMc: +			case ccMe: +				return CharClassify::ccWord; + +				// Punctuation +			case ccPc: +			case ccPd: +			case ccPs: +			case ccPe: +			case ccPi: +			case ccPf: +			case ccPo: +				// Symbol +			case ccSm: +			case ccSc: +			case ccSk: +			case ccSo: +				return CharClassify::ccPunctuation; + +			} +		} else { +			// Asian DBCS +			return CharClassify::ccWord; +		} +	} +	return charClass.GetClass(static_cast<unsigned char>(ch));  }  /**   * Used by commmands that want to select whole words.   * Finds the start of word at pos when delta < 0 or the end of the word when delta >= 0.   
*/ -int Document::ExtendWordSelect(int pos, int delta, bool onlyWordCharacters) { +int Document::ExtendWordSelect(int pos, int delta, bool onlyWordCharacters) const {  	CharClassify::cc ccStart = CharClassify::ccWord;  	if (delta < 0) { -		if (!onlyWordCharacters) -			ccStart = WordCharClass(cb.CharAt(pos-1)); -		while (pos > 0 && (WordCharClass(cb.CharAt(pos - 1)) == ccStart)) -			pos--; +		if (!onlyWordCharacters) { +			const CharacterExtracted ce = CharacterBefore(pos); +			ccStart = WordCharacterClass(ce.character); +		} +		while (pos > 0) { +			const CharacterExtracted ce = CharacterBefore(pos); +			if (WordCharacterClass(ce.character) != ccStart) +				break; +			pos -= ce.widthBytes; +		}  	} else { -		if (!onlyWordCharacters && pos < Length()) -			ccStart = WordCharClass(cb.CharAt(pos)); -		while (pos < (Length()) && (WordCharClass(cb.CharAt(pos)) == ccStart)) -			pos++; +		if (!onlyWordCharacters && pos < Length()) { +			const CharacterExtracted ce = CharacterAfter(pos); +			ccStart = WordCharacterClass(ce.character); +		} +		while (pos < Length()) { +			const CharacterExtracted ce = CharacterAfter(pos); +			if (WordCharacterClass(ce.character) != ccStart) +				break; +			pos += ce.widthBytes; +		}  	}  	return MovePositionOutsideChar(pos, delta, true);  } @@ -1518,22 +1662,39 @@ int Document::ExtendWordSelect(int pos, int delta, bool onlyWordCharacters) {   * additional movement to transit white space.   * Used by cursor movement by word commands.   
*/ -int Document::NextWordStart(int pos, int delta) { +int Document::NextWordStart(int pos, int delta) const {  	if (delta < 0) { -		while (pos > 0 && (WordCharClass(cb.CharAt(pos - 1)) == CharClassify::ccSpace)) -			pos--; +		while (pos > 0) { +			const CharacterExtracted ce = CharacterBefore(pos); +			if (WordCharacterClass(ce.character) != CharClassify::ccSpace) +				break; +			pos -= ce.widthBytes; +		}  		if (pos > 0) { -			CharClassify::cc ccStart = WordCharClass(cb.CharAt(pos-1)); -			while (pos > 0 && (WordCharClass(cb.CharAt(pos - 1)) == ccStart)) { -				pos--; +			CharacterExtracted ce = CharacterBefore(pos); +			const CharClassify::cc ccStart = WordCharacterClass(ce.character); +			while (pos > 0) { +				ce = CharacterBefore(pos); +				if (WordCharacterClass(ce.character) != ccStart) +					break; +				pos -= ce.widthBytes;  			}  		}  	} else { -		CharClassify::cc ccStart = WordCharClass(cb.CharAt(pos)); -		while (pos < (Length()) && (WordCharClass(cb.CharAt(pos)) == ccStart)) -			pos++; -		while (pos < (Length()) && (WordCharClass(cb.CharAt(pos)) == CharClassify::ccSpace)) -			pos++; +		CharacterExtracted ce = CharacterAfter(pos); +		const CharClassify::cc ccStart = WordCharacterClass(ce.character); +		while (pos < Length()) { +			ce = CharacterAfter(pos); +			if (WordCharacterClass(ce.character) != ccStart) +				break; +			pos += ce.widthBytes; +		} +		while (pos < Length()) { +			ce = CharacterAfter(pos); +			if (WordCharacterClass(ce.character) != CharClassify::ccSpace) +				break; +			pos += ce.widthBytes; +		}  	}  	return pos;  } @@ -1545,27 +1706,41 @@ int Document::NextWordStart(int pos, int delta) {   * additional movement to transit white space.   * Used by cursor movement by word commands.   
*/ -int Document::NextWordEnd(int pos, int delta) { +int Document::NextWordEnd(int pos, int delta) const {  	if (delta < 0) {  		if (pos > 0) { -			CharClassify::cc ccStart = WordCharClass(cb.CharAt(pos-1)); +			CharacterExtracted ce = CharacterBefore(pos); +			CharClassify::cc ccStart = WordCharacterClass(ce.character);  			if (ccStart != CharClassify::ccSpace) { -				while (pos > 0 && WordCharClass(cb.CharAt(pos - 1)) == ccStart) { -					pos--; +				while (pos > 0) { +					ce = CharacterBefore(pos); +					if (WordCharacterClass(ce.character) != ccStart) +						break; +					pos -= ce.widthBytes;  				}  			} -			while (pos > 0 && WordCharClass(cb.CharAt(pos - 1)) == CharClassify::ccSpace) { -				pos--; +			while (pos > 0) { +				ce = CharacterBefore(pos); +				if (WordCharacterClass(ce.character) != CharClassify::ccSpace) +					break; +				pos -= ce.widthBytes;  			}  		}  	} else { -		while (pos < Length() && WordCharClass(cb.CharAt(pos)) == CharClassify::ccSpace) { -			pos++; +		while (pos < Length()) { +			CharacterExtracted ce = CharacterAfter(pos); +			if (WordCharacterClass(ce.character) != CharClassify::ccSpace) +				break; +			pos += ce.widthBytes;  		}  		if (pos < Length()) { -			CharClassify::cc ccStart = WordCharClass(cb.CharAt(pos)); -			while (pos < Length() && WordCharClass(cb.CharAt(pos)) == ccStart) { -				pos++; +			CharacterExtracted ce = CharacterAfter(pos); +			CharClassify::cc ccStart = WordCharacterClass(ce.character); +			while (pos < Length()) { +				ce = CharacterAfter(pos); +				if (WordCharacterClass(ce.character) != ccStart) +					break; +				pos += ce.widthBytes;  			}  		}  	} @@ -1577,10 +1752,15 @@ int Document::NextWordEnd(int pos, int delta) {   * the previous character is of a different character class.   
*/  bool Document::IsWordStartAt(int pos) const { +	if (pos >= Length()) +		return false;  	if (pos > 0) { -		CharClassify::cc ccPos = WordCharClass(CharAt(pos)); +		const CharacterExtracted cePos = CharacterAfter(pos); +		const CharClassify::cc ccPos = WordCharacterClass(cePos.character); +		const CharacterExtracted cePrev = CharacterBefore(pos); +		const CharClassify::cc ccPrev = WordCharacterClass(cePrev.character);  		return (ccPos == CharClassify::ccWord || ccPos == CharClassify::ccPunctuation) && -			(ccPos != WordCharClass(CharAt(pos - 1))); +			(ccPos != ccPrev);  	}  	return true;  } @@ -1590,10 +1770,15 @@ bool Document::IsWordStartAt(int pos) const {   * the next character is of a different character class.   */  bool Document::IsWordEndAt(int pos) const { +	if (pos <= 0) +		return false;  	if (pos < Length()) { -		CharClassify::cc ccPrev = WordCharClass(CharAt(pos-1)); +		const CharacterExtracted cePos = CharacterAfter(pos); +		const CharClassify::cc ccPos = WordCharacterClass(cePos.character); +		const CharacterExtracted cePrev = CharacterBefore(pos); +		const CharClassify::cc ccPrev = WordCharacterClass(cePrev.character);  		return (ccPrev == CharClassify::ccWord || ccPrev == CharClassify::ccPunctuation) && -			(ccPrev != WordCharClass(CharAt(pos))); +			(ccPrev != ccPos);  	}  	return true;  } @@ -2075,96 +2260,137 @@ void Document::NotifyModified(DocModification mh) {  	}  } -bool Document::IsWordPartSeparator(char ch) const { -	return (WordCharClass(ch) == CharClassify::ccWord) && IsPunctuation(ch); +// Used for word part navigation. 
+static bool IsASCIIPunctuationCharacter(unsigned int ch) { +	switch (ch) { +	case '!': +	case '"': +	case '#': +	case '$': +	case '%': +	case '&': +	case '\'': +	case '(': +	case ')': +	case '*': +	case '+': +	case ',': +	case '-': +	case '.': +	case '/': +	case ':': +	case ';': +	case '<': +	case '=': +	case '>': +	case '?': +	case '@': +	case '[': +	case '\\': +	case ']': +	case '^': +	case '_': +	case '`': +	case '{': +	case '|': +	case '}': +	case '~': +		return true; +	default: +		return false; +	} +} + +bool Document::IsWordPartSeparator(unsigned int ch) const { +	return (WordCharacterClass(ch) == CharClassify::ccWord) && IsASCIIPunctuationCharacter(ch);  } -int Document::WordPartLeft(int pos) { +int Document::WordPartLeft(int pos) const {  	if (pos > 0) { -		--pos; -		char startChar = cb.CharAt(pos); -		if (IsWordPartSeparator(startChar)) { -			while (pos > 0 && IsWordPartSeparator(cb.CharAt(pos))) { -				--pos; +		pos -= CharacterBefore(pos).widthBytes; +		CharacterExtracted ceStart = CharacterAfter(pos); +		if (IsWordPartSeparator(ceStart.character)) { +			while (pos > 0 && IsWordPartSeparator(CharacterAfter(pos).character)) { +				pos -= CharacterBefore(pos).widthBytes;  			}  		}  		if (pos > 0) { -			startChar = cb.CharAt(pos); -			--pos; -			if (IsLowerCase(startChar)) { -				while (pos > 0 && IsLowerCase(cb.CharAt(pos))) -					--pos; -				if (!IsUpperCase(cb.CharAt(pos)) && !IsLowerCase(cb.CharAt(pos))) -					++pos; -			} else if (IsUpperCase(startChar)) { -				while (pos > 0 && IsUpperCase(cb.CharAt(pos))) -					--pos; -				if (!IsUpperCase(cb.CharAt(pos))) -					++pos; -			} else if (IsADigit(startChar)) { -				while (pos > 0 && IsADigit(cb.CharAt(pos))) -					--pos; -				if (!IsADigit(cb.CharAt(pos))) -					++pos; -			} else if (IsPunctuation(startChar)) { -				while (pos > 0 && IsPunctuation(cb.CharAt(pos))) -					--pos; -				if (!IsPunctuation(cb.CharAt(pos))) -					++pos; -			} else if (isspacechar(startChar)) { -				while (pos > 0 && 
isspacechar(cb.CharAt(pos))) -					--pos; -				if (!isspacechar(cb.CharAt(pos))) -					++pos; -			} else if (!IsASCII(startChar)) { -				while (pos > 0 && !IsASCII(cb.CharAt(pos))) -					--pos; -				if (IsASCII(cb.CharAt(pos))) -					++pos; +			ceStart = CharacterAfter(pos); +			pos -= CharacterBefore(pos).widthBytes; +			if (IsLowerCase(ceStart.character)) { +				while (pos > 0 && IsLowerCase(CharacterAfter(pos).character)) +					pos -= CharacterBefore(pos).widthBytes; +				if (!IsUpperCase(CharacterAfter(pos).character) && !IsLowerCase(CharacterAfter(pos).character)) +					pos += CharacterAfter(pos).widthBytes; +			} else if (IsUpperCase(ceStart.character)) { +				while (pos > 0 && IsUpperCase(CharacterAfter(pos).character)) +					pos -= CharacterBefore(pos).widthBytes; +				if (!IsUpperCase(CharacterAfter(pos).character)) +					pos += CharacterAfter(pos).widthBytes; +			} else if (IsADigit(ceStart.character)) { +				while (pos > 0 && IsADigit(CharacterAfter(pos).character)) +					pos -= CharacterBefore(pos).widthBytes; +				if (!IsADigit(CharacterAfter(pos).character)) +					pos += CharacterAfter(pos).widthBytes; +			} else if (IsASCIIPunctuationCharacter(ceStart.character)) { +				while (pos > 0 && IsASCIIPunctuationCharacter(CharacterAfter(pos).character)) +					pos -= CharacterBefore(pos).widthBytes; +				if (!IsASCIIPunctuationCharacter(CharacterAfter(pos).character)) +					pos += CharacterAfter(pos).widthBytes; +			} else if (isspacechar(ceStart.character)) { +				while (pos > 0 && isspacechar(CharacterAfter(pos).character)) +					pos -= CharacterBefore(pos).widthBytes; +				if (!isspacechar(CharacterAfter(pos).character)) +					pos += CharacterAfter(pos).widthBytes; +			} else if (!IsASCII(ceStart.character)) { +				while (pos > 0 && !IsASCII(CharacterAfter(pos).character)) +					pos -= CharacterBefore(pos).widthBytes; +				if (IsASCII(CharacterAfter(pos).character)) +					pos += CharacterAfter(pos).widthBytes;  			} else { -				++pos; +				pos += 
CharacterAfter(pos).widthBytes;  			}  		}  	}  	return pos;  } -int Document::WordPartRight(int pos) { -	char startChar = cb.CharAt(pos); -	int length = Length(); -	if (IsWordPartSeparator(startChar)) { -		while (pos < length && IsWordPartSeparator(cb.CharAt(pos))) -			++pos; -		startChar = cb.CharAt(pos); -	} -	if (!IsASCII(startChar)) { -		while (pos < length && !IsASCII(cb.CharAt(pos))) -			++pos; -	} else if (IsLowerCase(startChar)) { -		while (pos < length && IsLowerCase(cb.CharAt(pos))) -			++pos; -	} else if (IsUpperCase(startChar)) { -		if (IsLowerCase(cb.CharAt(pos + 1))) { -			++pos; -			while (pos < length && IsLowerCase(cb.CharAt(pos))) -				++pos; +int Document::WordPartRight(int pos) const { +	CharacterExtracted ceStart = CharacterAfter(pos); +	const int length = Length(); +	if (IsWordPartSeparator(ceStart.character)) { +		while (pos < length && IsWordPartSeparator(CharacterAfter(pos).character)) +			pos += CharacterAfter(pos).widthBytes; +		ceStart = CharacterAfter(pos); +	} +	if (!IsASCII(ceStart.character)) { +		while (pos < length && !IsASCII(CharacterAfter(pos).character)) +			pos += CharacterAfter(pos).widthBytes; +	} else if (IsLowerCase(ceStart.character)) { +		while (pos < length && IsLowerCase(CharacterAfter(pos).character)) +			pos += CharacterAfter(pos).widthBytes; +	} else if (IsUpperCase(ceStart.character)) { +		if (IsLowerCase(CharacterAfter(pos + ceStart.widthBytes).character)) { +			pos += CharacterAfter(pos).widthBytes; +			while (pos < length && IsLowerCase(CharacterAfter(pos).character)) +				pos += CharacterAfter(pos).widthBytes;  		} else { -			while (pos < length && IsUpperCase(cb.CharAt(pos))) -				++pos; -		} -		if (IsLowerCase(cb.CharAt(pos)) && IsUpperCase(cb.CharAt(pos - 1))) -			--pos; -	} else if (IsADigit(startChar)) { -		while (pos < length && IsADigit(cb.CharAt(pos))) -			++pos; -	} else if (IsPunctuation(startChar)) { -		while (pos < length && IsPunctuation(cb.CharAt(pos))) -			++pos; -	} else if 
(isspacechar(startChar)) { -		while (pos < length && isspacechar(cb.CharAt(pos))) -			++pos; +			while (pos < length && IsUpperCase(CharacterAfter(pos).character)) +				pos += CharacterAfter(pos).widthBytes; +		} +		if (IsLowerCase(CharacterAfter(pos).character) && IsUpperCase(CharacterBefore(pos).character)) +			pos -= CharacterBefore(pos).widthBytes; +	} else if (IsADigit(ceStart.character)) { +		while (pos < length && IsADigit(CharacterAfter(pos).character)) +			pos += CharacterAfter(pos).widthBytes; +	} else if (IsASCIIPunctuationCharacter(ceStart.character)) { +		while (pos < length && IsASCIIPunctuationCharacter(CharacterAfter(pos).character)) +			pos += CharacterAfter(pos).widthBytes; +	} else if (isspacechar(ceStart.character)) { +		while (pos < length && isspacechar(CharacterAfter(pos).character)) +			pos += CharacterAfter(pos).widthBytes;  	} else { -		++pos; +		pos += CharacterAfter(pos).widthBytes;  	}  	return pos;  } | 
