diff options
author | Neil <nyamatongwe@gmail.com> | 2017-03-23 17:49:50 +1100 |
---|---|---|
committer | Neil <nyamatongwe@gmail.com> | 2017-03-23 17:49:50 +1100 |
commit | 40468d40088ea0657d8046747a69c9e99821a403 (patch) | |
tree | 90461250a824f81b38a19aac3176b8911b422d9a /lexlib/CharacterCategory.cxx | |
parent | 54012e88adcb0fa83afdd1626aab55066c2e8247 (diff) | |
download | scintilla-mirror-40468d40088ea0657d8046747a69c9e99821a403.tar.gz |
The Python lexer recognizes identifiers more accurately when they include
non-ASCII characters.
Functions are provided for determining whether a character is in one of the sets
(ID_Start, ID_Continue, XID_Start, XID_Continue) defined for identifiers by the Unicode standard in UAX #31.
Diffstat (limited to 'lexlib/CharacterCategory.cxx')
-rw-r--r-- | lexlib/CharacterCategory.cxx | 146 |
1 files changed, 146 insertions, 0 deletions
diff --git a/lexlib/CharacterCategory.cxx b/lexlib/CharacterCategory.cxx index 7ca3a0f27..0880171e8 100644 --- a/lexlib/CharacterCategory.cxx +++ b/lexlib/CharacterCategory.cxx @@ -3704,6 +3704,152 @@ CharacterCategory CategoriseCharacter(int character) { return static_cast<CharacterCategory>(*(placeAfter-1) & maskCategory); } +// Implementation of character sets recommended for identifiers in Unicode Standard Annex #31. +// http://unicode.org/reports/tr31/ + +namespace { + +enum class OtherID { oidNone, oidStart, oidContinue }; + +// Some characters are treated as valid for identifiers even +// though most characters from their category are not. +// Values copied from http://www.unicode.org/Public/9.0.0/ucd/PropList.txt +OtherID OtherIDOfCharacter(int character) { + if ( + (character == 0x1885) || // MONGOLIAN LETTER ALI GALI BALUDA + (character == 0x1886) || // MONGOLIAN LETTER ALI GALI THREE BALUDA + (character == 0x2118) || // SCRIPT CAPITAL P + (character == 0x212E) || // ESTIMATED SYMBOL + (character == 0x309B) || // KATAKANA-HIRAGANA VOICED SOUND MARK + (character == 0x309C)) { // KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK + return OtherID::oidStart; + } else if ( + (character == 0x00B7) || // MIDDLE DOT + (character == 0x0387) || // GREEK ANO TELEIA + ((character >= 0x1369) && (character <= 0x1371)) || // ETHIOPIC DIGIT ONE..ETHIOPIC DIGIT NINE + (character == 0x19DA)) { // NEW TAI LUE THAM DIGIT ONE + return OtherID::oidContinue; + } else { + return OtherID::oidNone; + } +} + +// Determine if a character is in Ll|Lu|Lt|Lm|Lo|Nl|Mn|Mc|Nd|Pc and has +// Pattern_Syntax|Pattern_White_Space. +// As of Unicode 9, only VERTICAL TILDE which is in Lm and has Pattern_Syntax matches. +// Should really generate from PropList.txt a list of Pattern_Syntax and Pattern_White_Space. 
+bool IsIdPattern(int character) { + return character == 0x2E2F; +} + +bool OmitXidStart(int character) { + switch (character) { + case 0x037A: // GREEK YPOGEGRAMMENI + case 0x0E33: // THAI CHARACTER SARA AM + case 0x0EB3: // LAO VOWEL SIGN AM + case 0x309B: // KATAKANA-HIRAGANA VOICED SOUND MARK + case 0x309C: // KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK + case 0xFC5E: // ARABIC LIGATURE SHADDA WITH DAMMATAN ISOLATED FORM + case 0xFC5F: // ARABIC LIGATURE SHADDA WITH KASRATAN ISOLATED FORM + case 0xFC60: // ARABIC LIGATURE SHADDA WITH FATHA ISOLATED FORM + case 0xFC61: // ARABIC LIGATURE SHADDA WITH DAMMA ISOLATED FORM + case 0xFC62: // ARABIC LIGATURE SHADDA WITH KASRA ISOLATED FORM + case 0xFC63: // ARABIC LIGATURE SHADDA WITH SUPERSCRIPT ALEF ISOLATED FORM + case 0xFDFA: // ARABIC LIGATURE SALLALLAHOU ALAYHE WASALLAM + case 0xFDFB: // ARABIC LIGATURE JALLAJALALOUHOU + case 0xFE70: // ARABIC FATHATAN ISOLATED FORM + case 0xFE72: // ARABIC DAMMATAN ISOLATED FORM + case 0xFE74: // ARABIC KASRATAN ISOLATED FORM + case 0xFE76: // ARABIC FATHA ISOLATED FORM + case 0xFE78: // ARABIC DAMMA ISOLATED FORM + case 0xFE7A: // ARABIC KASRA ISOLATED FORM + case 0xFE7C: // ARABIC SHADDA ISOLATED FORM + case 0xFE7E: // ARABIC SUKUN ISOLATED FORM + case 0xFF9E: // HALFWIDTH KATAKANA VOICED SOUND MARK + case 0xFF9F: // HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK + return true; + default: + return false; + } +} + +bool OmitXidContinue(int character) { + switch (character) { + case 0x037A: // GREEK YPOGEGRAMMENI + case 0x309B: // KATAKANA-HIRAGANA VOICED SOUND MARK + case 0x309C: // KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK + case 0xFC5E: // ARABIC LIGATURE SHADDA WITH DAMMATAN ISOLATED FORM + case 0xFC5F: // ARABIC LIGATURE SHADDA WITH KASRATAN ISOLATED FORM + case 0xFC60: // ARABIC LIGATURE SHADDA WITH FATHA ISOLATED FORM + case 0xFC61: // ARABIC LIGATURE SHADDA WITH DAMMA ISOLATED FORM + case 0xFC62: // ARABIC LIGATURE SHADDA WITH KASRA ISOLATED FORM + case 0xFC63: // ARABIC 
LIGATURE SHADDA WITH SUPERSCRIPT ALEF ISOLATED FORM + case 0xFDFA: // ARABIC LIGATURE SALLALLAHOU ALAYHE WASALLAM + case 0xFDFB: // ARABIC LIGATURE JALLAJALALOUHOU + case 0xFE70: // ARABIC FATHATAN ISOLATED FORM + case 0xFE72: // ARABIC DAMMATAN ISOLATED FORM + case 0xFE74: // ARABIC KASRATAN ISOLATED FORM + case 0xFE76: // ARABIC FATHA ISOLATED FORM + case 0xFE78: // ARABIC DAMMA ISOLATED FORM + case 0xFE7A: // ARABIC KASRA ISOLATED FORM + case 0xFE7C: // ARABIC SHADDA ISOLATED FORM + case 0xFE7E: // ARABIC SUKUN ISOLATED FORM + return true; + default: + return false; + } +} + +} + +// UAX #31 defines ID_Start as +// [[:L:][:Nl:][:Other_ID_Start:]--[:Pattern_Syntax:]--[:Pattern_White_Space:]] +bool IsIdStart(int character) { + if (IsIdPattern(character)) { + return false; + } + OtherID oid = OtherIDOfCharacter(character); + if (oid == OtherID::oidStart) { + return true; + } + CharacterCategory c = CategoriseCharacter(character); + return (c == ccLl || c == ccLu || c == ccLt || c == ccLm || c == ccLo + || c == ccNl); +} + +// UAX #31 defines ID_Continue as +// [[:ID_Start:][:Mn:][:Mc:][:Nd:][:Pc:][:Other_ID_Continue:]--[:Pattern_Syntax:]--[:Pattern_White_Space:]] +bool IsIdContinue(int character) { + if (IsIdPattern(character)) { + return false; + } + OtherID oid = OtherIDOfCharacter(character); + if (oid != OtherID::oidNone) { + return true; + } + CharacterCategory c = CategoriseCharacter(character); + return (c == ccLl || c == ccLu || c == ccLt || c == ccLm || c == ccLo + || c == ccNl || c == ccMn || c == ccMc || c == ccNd || c == ccPc); +} + +// XID_Start is ID_Start modified for Normalization Form KC in UAX #31 +bool IsXidStart(int character) { + if (OmitXidStart(character)) { + return false; + } else { + return IsIdStart(character); + } +} + +// XID_Continue is ID_Continue modified for Normalization Form KC in UAX #31 +bool IsXidContinue(int character) { + if (OmitXidContinue(character)) { + return false; + } else { + return IsIdContinue(character); + } +} + 
#ifdef SCI_NAMESPACE } #endif |