diff options
| -rw-r--r-- | doc/ScintillaHistory.html | 3 | ||||
| -rw-r--r-- | lexers/LexPython.cxx | 14 | ||||
| -rw-r--r-- | lexlib/CharacterCategory.cxx | 146 | ||||
| -rw-r--r-- | lexlib/CharacterCategory.h | 6 | 
4 files changed, 159 insertions, 10 deletions
| diff --git a/doc/ScintillaHistory.html b/doc/ScintillaHistory.html index 6dc53e548..b3e4e1d2c 100644 --- a/doc/ScintillaHistory.html +++ b/doc/ScintillaHistory.html @@ -529,6 +529,9 @@  	<li>  	Updated case conversion and character categories to Unicode 9.  	</li> +	<li> +	The Python lexer recognizes identifiers more accurately when they include non-ASCII characters. +	</li>      </ul>      <h3>         <a href="http://www.scintilla.org/scite374.zip">Release 3.7.4</a> diff --git a/lexers/LexPython.cxx b/lexers/LexPython.cxx index f667cb32c..62ed83c95 100644 --- a/lexers/LexPython.cxx +++ b/lexers/LexPython.cxx @@ -192,11 +192,8 @@ inline bool IsAWordChar(int ch, bool unicodeIdentifiers) {  	if (!unicodeIdentifiers)  		return false; -	// Approximation, Python uses the XID_Continue set from unicode data -	// see http://unicode.org/reports/tr31/ -	CharacterCategory c = CategoriseCharacter(ch); -	return (c == ccLl || c == ccLu || c == ccLt || c == ccLm || c == ccLo -		|| c == ccNl || c == ccMn || c == ccMc || c == ccNd || c == ccPc); +	// Python uses the XID_Continue set from unicode data +	return IsXidContinue(ch);  }  inline bool IsAWordStart(int ch, bool unicodeIdentifiers) { @@ -206,11 +203,8 @@ inline bool IsAWordStart(int ch, bool unicodeIdentifiers) {  	if (!unicodeIdentifiers)  		return false; -	// Approximation, Python uses the XID_Start set from unicode data -	// see http://unicode.org/reports/tr31/ -	CharacterCategory c = CategoriseCharacter(ch); -	return (c == ccLl || c == ccLu || c == ccLt || c == ccLm || c == ccLo -		|| c == ccNl); +	// Python uses the XID_Start set from unicode data +	return IsXidStart(ch);  }  static bool IsFirstNonWhitespace(Sci_Position pos, Accessor &styler) { diff --git a/lexlib/CharacterCategory.cxx b/lexlib/CharacterCategory.cxx index 7ca3a0f27..0880171e8 100644 --- a/lexlib/CharacterCategory.cxx +++ b/lexlib/CharacterCategory.cxx @@ -3704,6 +3704,152 @@ CharacterCategory CategoriseCharacter(int character) {  	return static_cast<CharacterCategory>(*(placeAfter-1) & maskCategory);  } +// Implementation of character sets recommended for identifiers in Unicode Standard Annex #31. +// http://unicode.org/reports/tr31/ + +namespace { + +enum class OtherID { oidNone, oidStart, oidContinue }; + +// Some characters are treated as valid for identifiers even +// though most characters from their category are not. +// Values copied from http://www.unicode.org/Public/9.0.0/ucd/PropList.txt +OtherID OtherIDOfCharacter(int character) { +	if ( +		(character == 0x1885) ||	// MONGOLIAN LETTER ALI GALI BALUDA +		(character == 0x1886) ||	// MONGOLIAN LETTER ALI GALI THREE BALUDA +		(character == 0x2118) ||	// SCRIPT CAPITAL P +		(character == 0x212E) ||	// ESTIMATED SYMBOL +		(character == 0x309B) ||	// KATAKANA-HIRAGANA VOICED SOUND MARK +		(character == 0x309C)) {	// KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK +		return OtherID::oidStart; +	} else if ( +		(character == 0x00B7) ||	// MIDDLE DOT +		(character == 0x0387) ||	// GREEK ANO TELEIA +		((character >= 0x1369) && (character <= 0x1371)) ||	// ETHIOPIC DIGIT ONE..ETHIOPIC DIGIT NINE +		(character == 0x19DA)) {	// NEW TAI LUE THAM DIGIT ONE +		return OtherID::oidContinue; +	} else { +		return OtherID::oidNone; +	} +} + +// Determine if a character is in  Ll|Lu|Lt|Lm|Lo|Nl|Mn|Mc|Nd|Pc and has +// Pattern_Syntax|Pattern_White_Space. +// As of Unicode 9, only VERTICAL TILDE which is in Lm and has Pattern_Syntax matches. +// Should really generate from PropList.txt a list of Pattern_Syntax and Pattern_White_Space. +bool IsIdPattern(int character) { +	return character == 0x2E2F; +} + +bool OmitXidStart(int character) { +	switch (character) { +	case 0x037A:	// GREEK YPOGEGRAMMENI +	case 0x0E33:	// THAI CHARACTER SARA AM +	case 0x0EB3:	// LAO VOWEL SIGN AM +	case 0x309B:	// KATAKANA-HIRAGANA VOICED SOUND MARK +	case 0x309C:	// KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK +	case 0xFC5E:	// ARABIC LIGATURE SHADDA WITH DAMMATAN ISOLATED FORM +	case 0xFC5F:	// ARABIC LIGATURE SHADDA WITH KASRATAN ISOLATED FORM +	case 0xFC60:	// ARABIC LIGATURE SHADDA WITH FATHA ISOLATED FORM +	case 0xFC61:	// ARABIC LIGATURE SHADDA WITH DAMMA ISOLATED FORM +	case 0xFC62:	// ARABIC LIGATURE SHADDA WITH KASRA ISOLATED FORM +	case 0xFC63:	// ARABIC LIGATURE SHADDA WITH SUPERSCRIPT ALEF ISOLATED FORM +	case 0xFDFA:	// ARABIC LIGATURE SALLALLAHOU ALAYHE WASALLAM +	case 0xFDFB:	// ARABIC LIGATURE JALLAJALALOUHOU +	case 0xFE70:	// ARABIC FATHATAN ISOLATED FORM +	case 0xFE72:	// ARABIC DAMMATAN ISOLATED FORM +	case 0xFE74:	// ARABIC KASRATAN ISOLATED FORM +	case 0xFE76:	// ARABIC FATHA ISOLATED FORM +	case 0xFE78:	// ARABIC DAMMA ISOLATED FORM +	case 0xFE7A:	// ARABIC KASRA ISOLATED FORM +	case 0xFE7C:	// ARABIC SHADDA ISOLATED FORM +	case 0xFE7E:	// ARABIC SUKUN ISOLATED FORM +	case 0xFF9E:	// HALFWIDTH KATAKANA VOICED SOUND MARK +	case 0xFF9F:	// HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK +		return true; +	default: +		return false; +	} +} + +bool OmitXidContinue(int character) { +	switch (character) { +	case 0x037A:	// GREEK YPOGEGRAMMENI +	case 0x309B:	// KATAKANA-HIRAGANA VOICED SOUND MARK +	case 0x309C:	// KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK +	case 0xFC5E:	// ARABIC LIGATURE SHADDA WITH DAMMATAN ISOLATED FORM +	case 0xFC5F:	// ARABIC LIGATURE SHADDA WITH KASRATAN ISOLATED FORM +	case 0xFC60:	// ARABIC LIGATURE SHADDA WITH FATHA ISOLATED FORM +	case 0xFC61:	// ARABIC LIGATURE SHADDA WITH DAMMA ISOLATED FORM +	case 0xFC62:	// ARABIC LIGATURE SHADDA WITH KASRA ISOLATED FORM +	case 0xFC63:	// ARABIC LIGATURE SHADDA WITH SUPERSCRIPT ALEF ISOLATED FORM +	case 0xFDFA:	// ARABIC LIGATURE SALLALLAHOU ALAYHE WASALLAM +	case 0xFDFB:	// ARABIC LIGATURE JALLAJALALOUHOU +	case 0xFE70:	// ARABIC FATHATAN ISOLATED FORM +	case 0xFE72:	// ARABIC DAMMATAN ISOLATED FORM +	case 0xFE74:	// ARABIC KASRATAN ISOLATED FORM +	case 0xFE76:	// ARABIC FATHA ISOLATED FORM +	case 0xFE78:	// ARABIC DAMMA ISOLATED FORM +	case 0xFE7A:	// ARABIC KASRA ISOLATED FORM +	case 0xFE7C:	// ARABIC SHADDA ISOLATED FORM +	case 0xFE7E:	// ARABIC SUKUN ISOLATED FORM +		return true; +	default: +		return false; +	} +} + +} + +// UAX #31 defines ID_Start as +// [[:L:][:Nl:][:Other_ID_Start:]--[:Pattern_Syntax:]--[:Pattern_White_Space:]] +bool IsIdStart(int character) { +	if (IsIdPattern(character)) { +		return false; +	} +	OtherID oid = OtherIDOfCharacter(character); +	if (oid == OtherID::oidStart) { +		return true; +	} +	CharacterCategory c = CategoriseCharacter(character); +	return (c == ccLl || c == ccLu || c == ccLt || c == ccLm || c == ccLo +		|| c == ccNl); +} + +// UAX #31 defines ID_Continue as +// [[:ID_Start:][:Mn:][:Mc:][:Nd:][:Pc:][:Other_ID_Continue:]--[:Pattern_Syntax:]--[:Pattern_White_Space:]] +bool IsIdContinue(int character) { +	if (IsIdPattern(character)) { +		return false; +	} +	OtherID oid = OtherIDOfCharacter(character); +	if (oid != OtherID::oidNone) { +		return true; +	} +	CharacterCategory c = CategoriseCharacter(character); +	return (c == ccLl || c == ccLu || c == ccLt || c == ccLm || c == ccLo +		|| c == ccNl || c == ccMn || c == ccMc || c == ccNd || c == ccPc); +} + +// XID_Start is ID_Start modified for Normalization Form KC in UAX #31 +bool IsXidStart(int character) { +	if (OmitXidStart(character)) { +		return false; +	} else { +		return IsIdStart(character); +	} +} + +// XID_Continue is ID_Continue modified for Normalization Form KC in UAX #31 +bool IsXidContinue(int character) { +	if (OmitXidContinue(character)) { +		return false; +	} else { +		return IsIdContinue(character); +	} +} +  #ifdef SCI_NAMESPACE  }  #endif diff --git a/lexlib/CharacterCategory.h b/lexlib/CharacterCategory.h index c8600504b..05ea4187b 100644 --- a/lexlib/CharacterCategory.h +++ b/lexlib/CharacterCategory.h @@ -24,6 +24,12 @@ enum CharacterCategory {  CharacterCategory CategoriseCharacter(int character); +// Common definitions of allowable characters in identifiers from UAX #31. +bool IsIdStart(int character); +bool IsIdContinue(int character); +bool IsXidStart(int character); +bool IsXidContinue(int character); +  #ifdef SCI_NAMESPACE  }  #endif | 
