diff options
| author | Neil <nyamatongwe@gmail.com> | 2022-11-12 20:37:31 +1100 | 
|---|---|---|
| committer | Neil <nyamatongwe@gmail.com> | 2022-11-12 20:37:31 +1100 | 
| commit | 386b3dc9ddb38992ebc1c05b034b3dd2d8dcc2d9 (patch) | |
| tree | 106f4ad4308aabef8de78b2121698e4a87f118f3 | |
| parent | 5ae14dd681c7f78af6d184286f3c2a94dac9a40b (diff) | |
| download | scintilla-mirror-386b3dc9ddb38992ebc1c05b034b3dd2d8dcc2d9.tar.gz | |
Hoist common conversion from UTF-8 byte string into CharacterExtracted
constructor.
Move CharacterExtracted out of Document so it can be more widely used.
| -rw-r--r-- | src/Document.cxx | 44 | ||||
| -rw-r--r-- | src/Document.h | 39 | 
2 files changed, 44 insertions, 39 deletions
| diff --git a/src/Document.cxx b/src/Document.cxx index 10c8e9ce5..87cace721 100644 --- a/src/Document.cxx +++ b/src/Document.cxx @@ -125,6 +125,18 @@ size_t ActionDuration::ActionsInAllowedTime(double secondsAllowed) const noexcep  	return std::lround(secondsAllowed / Duration());  } +CharacterExtracted::CharacterExtracted(const unsigned char *charBytes, size_t widthCharBytes) noexcept { +	const int utf8status = UTF8Classify(charBytes, widthCharBytes); +	if (utf8status & UTF8MaskInvalid) { +		// Treat as invalid and use up just one byte +		character = unicodeReplacementChar; +		widthBytes = 1; +	} else { +		character = UnicodeFromUTF8(charBytes); +		widthBytes = utf8status & UTF8MaskWidth; +	} +} +  Document::Document(DocumentOption options) :  	cb(!FlagSet(options, DocumentOption::StylesNone), FlagSet(options, DocumentOption::TextLarge)),  	durationStyleOneByte(0.000001, 0.0000001, 0.00001) { @@ -917,7 +929,7 @@ bool Document::NextCharacter(Sci::Position &pos, int moveDir) const noexcept {  	}  } -Document::CharacterExtracted Document::CharacterAfter(Sci::Position position) const noexcept { +CharacterExtracted Document::CharacterAfter(Sci::Position position) const noexcept {  	if (position >= LengthNoExcept()) {  		return CharacterExtracted(unicodeReplacementChar, 0);  	} @@ -931,13 +943,7 @@ Document::CharacterExtracted Document::CharacterAfter(Sci::Position position) co  		unsigned char charBytes[UTF8MaxBytes] = { leadByte, 0, 0, 0 };  		for (int b = 1; b<widthCharBytes; b++)  			charBytes[b] = cb.UCharAt(position + b); -		const int utf8status = UTF8Classify(charBytes, widthCharBytes); -		if (utf8status & UTF8MaskInvalid) { -			// Treat as invalid and use up just one byte -			return CharacterExtracted(unicodeReplacementChar, 1); -		} else { -			return CharacterExtracted(UnicodeFromUTF8(charBytes), utf8status & UTF8MaskWidth); -		} +		return CharacterExtracted(charBytes, widthCharBytes);  	} else {  		if (IsDBCSLeadByteNoExcept(leadByte)) {  			const unsigned char trailByte = cb.UCharAt(position + 1); @@ -949,7 +955,7 @@ Document::CharacterExtracted Document::CharacterAfter(Sci::Position position) co  	}  } -Document::CharacterExtracted Document::CharacterBefore(Sci::Position position) const noexcept { +CharacterExtracted Document::CharacterBefore(Sci::Position position) const noexcept {  	if (position <= 0) {  		return CharacterExtracted(unicodeReplacementChar, 0);  	} @@ -972,13 +978,7 @@ Document::CharacterExtracted Document::CharacterBefore(Sci::Position position) c  				unsigned char charBytes[UTF8MaxBytes] = { 0, 0, 0, 0 };  				for (Sci::Position b = 0; b<widthCharBytes; b++)  					charBytes[b] = cb.UCharAt(startUTF + b); -				const int utf8status = UTF8Classify(charBytes, widthCharBytes); -				if (utf8status & UTF8MaskInvalid) { -					// Treat as invalid and use up just one byte -					return CharacterExtracted(unicodeReplacementChar, 1); -				} else { -					return CharacterExtracted(UnicodeFromUTF8(charBytes), utf8status & UTF8MaskWidth); -				} +				return CharacterExtracted(charBytes, widthCharBytes);  			}  			// Else invalid UTF-8 so return position of isolated trail byte  		} @@ -2037,7 +2037,7 @@ void Document::SetCaseFolder(std::unique_ptr<CaseFolder> pcf_) noexcept {  	pcf = std::move(pcf_);  } -Document::CharacterExtracted Document::ExtractCharacter(Sci::Position position) const noexcept { +CharacterExtracted Document::ExtractCharacter(Sci::Position position) const noexcept {  	const unsigned char leadByte = cb.UCharAt(position);  	if (UTF8IsAscii(leadByte)) {  		// Common case: ASCII character @@ -2047,13 +2047,7 @@ Document::CharacterExtracted Document::ExtractCharacter(Sci::Position position)  	unsigned char charBytes[UTF8MaxBytes] = { leadByte, 0, 0, 0 };  	for (int b=1; b<widthCharBytes; b++)  		charBytes[b] = cb.UCharAt(position + b); -	const int utf8status = UTF8Classify(charBytes, widthCharBytes); -	if (utf8status & UTF8MaskInvalid) { -		// Treat as invalid and use up just one byte -		return CharacterExtracted(unicodeReplacementChar, 1); -	} else { -		return CharacterExtracted(UnicodeFromUTF8(charBytes), utf8status & UTF8MaskWidth); -	} +	return CharacterExtracted(charBytes, widthCharBytes);  }  namespace { @@ -3040,7 +3034,7 @@ public:  	}  private:  	void ReadCharacter() noexcept { -		const Document::CharacterExtracted charExtracted = doc->ExtractCharacter(position); +		const CharacterExtracted charExtracted = doc->ExtractCharacter(position);  		lenBytes = charExtracted.widthBytes;  		if (charExtracted.character == unicodeReplacementChar) {  			lenCharacters = 1; diff --git a/src/Document.h b/src/Document.h index 1cda253e1..ae784180a 100644 --- a/src/Document.h +++ b/src/Document.h @@ -227,6 +227,29 @@ public:  };  /** + * A whole character (code point) with a value and width in bytes. + * For UTF-8, the value is the code point value. + * For DBCS, its jamming the lead and trail bytes together. + * For 8 bit encodings, is just the byte value. + */ +struct CharacterExtracted { +	unsigned int character; +	unsigned int widthBytes; + +	CharacterExtracted(unsigned int character_, unsigned int widthBytes_) noexcept : +		character(character_), widthBytes(widthBytes_) { +	} + +	// For UTF-8: +	CharacterExtracted(const unsigned char *charBytes, size_t widthCharBytes) noexcept; + +	// For DBCS characters turn 2 bytes into an int +	static CharacterExtracted DBCS(unsigned char lead, unsigned char trail) noexcept { +		return CharacterExtracted((lead << 8) | trail, 2); +	} +}; + +/**   */  class Document : PerLine, public Scintilla::IDocument, public Scintilla::ILoader { @@ -276,18 +299,6 @@ private:  public: -	struct CharacterExtracted { -		unsigned int character; -		unsigned int widthBytes; -		CharacterExtracted(unsigned int character_, unsigned int widthBytes_) noexcept : -			character(character_), widthBytes(widthBytes_) { -		} -		// For DBCS characters turn 2 bytes into an int -		static CharacterExtracted DBCS(unsigned char lead, unsigned char trail) noexcept { -			return CharacterExtracted((lead << 8) | trail, 2); -		} -	}; -  	Scintilla::EndOfLine eolMode;  	/// Can also be SC_CP_UTF8 to enable UTF-8 mode  	int dbcsCodePage; @@ -341,8 +352,8 @@ public:  	Sci::Position MovePositionOutsideChar(Sci::Position pos, Sci::Position moveDir, bool checkLineEnd=true) const noexcept;  	Sci::Position NextPosition(Sci::Position pos, int moveDir) const noexcept;  	bool NextCharacter(Sci::Position &pos, int moveDir) const noexcept;	// Returns true if pos changed -	Document::CharacterExtracted CharacterAfter(Sci::Position position) const noexcept; -	Document::CharacterExtracted CharacterBefore(Sci::Position position) const noexcept; +	CharacterExtracted CharacterAfter(Sci::Position position) const noexcept; +	CharacterExtracted CharacterBefore(Sci::Position position) const noexcept;  	Sci_Position SCI_METHOD GetRelativePosition(Sci_Position positionStart, Sci_Position characterOffset) const override;  	Sci::Position GetRelativePositionUTF16(Sci::Position positionStart, Sci::Position characterOffset) const noexcept;  	int SCI_METHOD GetCharacterAndWidth(Sci_Position position, Sci_Position *pWidth) const override; | 
