diff options
Diffstat (limited to 'src')
| -rw-r--r-- | src/Document.cxx | 119 | 
1 files changed, 61 insertions, 58 deletions
| diff --git a/src/Document.cxx b/src/Document.cxx index 4e9366064..244e96e4f 100644 --- a/src/Document.cxx +++ b/src/Document.cxx @@ -465,51 +465,29 @@ int Document::LenChar(int pos) {  	}  } -static inline bool IsTrailByte(int ch) { -	return (ch >= 0x80) && (ch < 0xc0); -} - -static int BytesFromLead(int leadByte) { -	if (leadByte > 0xF4) { -		// Characters longer than 4 bytes not possible in current UTF-8 -		return 0; -	} else if (leadByte >= 0xF0) { -		return 4; -	} else if (leadByte >= 0xE0) { -		return 3; -	} else if (leadByte >= 0xC2) { -		return 2; -	} -	return 0; -} -  bool Document::InGoodUTF8(int pos, int &start, int &end) const { -	int lead = pos; -	while ((lead>0) && (pos-lead < 4) && IsTrailByte(static_cast<unsigned char>(cb.CharAt(lead-1)))) -		lead--; -	start = 0; -	if (lead > 0) { -		start = lead-1; -	} -	int leadByte = static_cast<unsigned char>(cb.CharAt(start)); -	int bytes = BytesFromLead(leadByte); -	if (bytes == 0) { +	int trail = pos; +	while ((trail>0) && (pos-trail < UTF8MaxBytes) && UTF8IsTrailByte(static_cast<unsigned char>(cb.CharAt(trail-1)))) +		trail--; +	start = (trail > 0) ? trail-1 : trail; + +	const unsigned char leadByte = static_cast<unsigned char>(cb.CharAt(start)); +	const int widthCharBytes = UTF8BytesOfLead[leadByte]; +	if (widthCharBytes == 1) {  		return false;  	} else { -		int trailBytes = bytes - 1; -		int len = pos - lead + 1; +		int trailBytes = widthCharBytes - 1; +		int len = pos - start;  		if (len > trailBytes)  			// pos too far from lead  			return false; -		// Check that there are enough trails for this lead -		int trail = pos + 1; -		while ((trail-lead<trailBytes) && (trail < Length())) { -			if (!IsTrailByte(static_cast<unsigned char>(cb.CharAt(trail)))) { -				return false; -			} -			trail++; -		} -		end = start + bytes; +		char charBytes[UTF8MaxBytes] = {static_cast<char>(leadByte),0,0,0}; +		for (int b=1; b<widthCharBytes && ((start+b) < Length()); b++) +			charBytes[b] = cb.CharAt(static_cast<int>(start+b)); +		int utf8status = UTF8Classify(reinterpret_cast<const unsigned char *>(charBytes), widthCharBytes); +		if (utf8status & UTF8MaskInvalid) +			return false; +		end = start + widthCharBytes;  		return true;  	}  } @@ -538,14 +516,18 @@ int Document::MovePositionOutsideChar(int pos, int moveDir, bool checkLineEnd) {  	if (dbcsCodePage) {  		if (SC_CP_UTF8 == dbcsCodePage) {  			unsigned char ch = static_cast<unsigned char>(cb.CharAt(pos)); -			int startUTF = pos; -			int endUTF = pos; -			if (IsTrailByte(ch) && InGoodUTF8(pos, startUTF, endUTF)) { -				// ch is a trail byte within a UTF-8 character -				if (moveDir > 0) -					pos = endUTF; -				else -					pos = startUTF; +			// If ch is not a trail byte then pos is valid intercharacter position +			if (UTF8IsTrailByte(ch)) { +				int startUTF = pos; +				int endUTF = pos; +				if (InGoodUTF8(pos, startUTF, endUTF)) { +					// ch is a trail byte within a UTF-8 character +					if (moveDir > 0) +						pos = endUTF; +					else +						pos = startUTF; +				} +				// Else invalid UTF-8 so return position of isolated trail byte  			}  		} else {  			// Anchor DBCS calculations at start of line because start of line can @@ -592,16 +574,37 @@ int Document::NextPosition(int pos, int moveDir) const {  	if (dbcsCodePage) {  		if (SC_CP_UTF8 == dbcsCodePage) { -			pos += increment; -			unsigned char ch = static_cast<unsigned char>(cb.CharAt(pos)); -			int startUTF = pos; -			int endUTF = pos; -			if (IsTrailByte(ch) && InGoodUTF8(pos, startUTF, endUTF)) { -				// ch is a trail byte within a UTF-8 character -				if (moveDir > 0) -					pos = endUTF; -				else -					pos = startUTF; +			if (increment == 1) { +				// Simple forward movement case so can avoid some checks +				const unsigned char leadByte = static_cast<unsigned char>(cb.CharAt(pos)); +				if (UTF8IsAscii(leadByte)) { +					// Single byte character or invalid +					pos++; +				} else { +					const int widthCharBytes = UTF8BytesOfLead[leadByte]; +					char charBytes[UTF8MaxBytes] = {static_cast<char>(leadByte),0,0,0}; +					for (int b=1; b<widthCharBytes; b++) +						charBytes[b] = cb.CharAt(static_cast<int>(pos+b)); +					int utf8status = UTF8Classify(reinterpret_cast<const unsigned char *>(charBytes), widthCharBytes); +					if (utf8status & UTF8MaskInvalid) +						pos++; +					else +						pos += utf8status & UTF8MaskWidth; +				} +			} else { +				// Examine byte before position +				pos--; +				unsigned char ch = static_cast<unsigned char>(cb.CharAt(pos)); +				// If ch is not a trail byte then pos is valid intercharacter position +				if (UTF8IsTrailByte(ch)) { +					// If ch is a trail byte in a valid UTF-8 character then return start of character +					int startUTF = pos; +					int endUTF = pos; +					if (InGoodUTF8(pos, startUTF, endUTF)) { +						pos = startUTF; +					} +					// Else invalid UTF-8 so return position of isolated trail byte +				}  			}  		} else {  			if (moveDir > 0) { @@ -1246,7 +1249,7 @@ int Document::ParaDown(int pos) {  }  CharClassify::cc Document::WordCharClass(unsigned char ch) { -	if ((SC_CP_UTF8 == dbcsCodePage) && (ch >= 0x80)) +	if ((SC_CP_UTF8 == dbcsCodePage) && (!UTF8IsAscii(ch)))  		return CharClassify::ccWord;  	return charClass.GetClass(ch);  } | 
