aboutsummaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
author Neil <nyamatongwe@gmail.com> 2017-03-23 17:49:50 +1100
committer Neil <nyamatongwe@gmail.com> 2017-03-23 17:49:50 +1100
commit 40468d40088ea0657d8046747a69c9e99821a403 (patch)
tree 90461250a824f81b38a19aac3176b8911b422d9a
parent 54012e88adcb0fa83afdd1626aab55066c2e8247 (diff)
download scintilla-mirror-40468d40088ea0657d8046747a69c9e99821a403.tar.gz
The Python lexer recognizes identifiers more accurately when they include
non-ASCII characters. Functions are provided for determining whether characters are in the sets defined for identifiers by the Unicode standard in UAX #31.
-rw-r--r-- doc/ScintillaHistory.html 3
-rw-r--r-- lexers/LexPython.cxx 14
-rw-r--r-- lexlib/CharacterCategory.cxx 146
-rw-r--r-- lexlib/CharacterCategory.h 6
4 files changed, 159 insertions, 10 deletions
diff --git a/doc/ScintillaHistory.html b/doc/ScintillaHistory.html
index 6dc53e548..b3e4e1d2c 100644
--- a/doc/ScintillaHistory.html
+++ b/doc/ScintillaHistory.html
@@ -529,6 +529,9 @@
<li>
Updated case conversion and character categories to Unicode 9.
</li>
+ <li>
+ The Python lexer recognizes identifiers more accurately when they include non-ASCII characters.
+ </li>
</ul>
<h3>
<a href="http://www.scintilla.org/scite374.zip">Release 3.7.4</a>
diff --git a/lexers/LexPython.cxx b/lexers/LexPython.cxx
index f667cb32c..62ed83c95 100644
--- a/lexers/LexPython.cxx
+++ b/lexers/LexPython.cxx
@@ -192,11 +192,8 @@ inline bool IsAWordChar(int ch, bool unicodeIdentifiers) {
if (!unicodeIdentifiers)
return false;
- // Approximation, Python uses the XID_Continue set from unicode data
- // see http://unicode.org/reports/tr31/
- CharacterCategory c = CategoriseCharacter(ch);
- return (c == ccLl || c == ccLu || c == ccLt || c == ccLm || c == ccLo
- || c == ccNl || c == ccMn || c == ccMc || c == ccNd || c == ccPc);
+ // Python uses the XID_Continue set from unicode data
+ return IsXidContinue(ch);
}
inline bool IsAWordStart(int ch, bool unicodeIdentifiers) {
@@ -206,11 +203,8 @@ inline bool IsAWordStart(int ch, bool unicodeIdentifiers) {
if (!unicodeIdentifiers)
return false;
- // Approximation, Python uses the XID_Start set from unicode data
- // see http://unicode.org/reports/tr31/
- CharacterCategory c = CategoriseCharacter(ch);
- return (c == ccLl || c == ccLu || c == ccLt || c == ccLm || c == ccLo
- || c == ccNl);
+ // Python uses the XID_Start set from unicode data
+ return IsXidStart(ch);
}
static bool IsFirstNonWhitespace(Sci_Position pos, Accessor &styler) {
diff --git a/lexlib/CharacterCategory.cxx b/lexlib/CharacterCategory.cxx
index 7ca3a0f27..0880171e8 100644
--- a/lexlib/CharacterCategory.cxx
+++ b/lexlib/CharacterCategory.cxx
@@ -3704,6 +3704,152 @@ CharacterCategory CategoriseCharacter(int character) {
return static_cast<CharacterCategory>(*(placeAfter-1) & maskCategory);
}
+// Implementation of character sets recommended for identifiers in Unicode Standard Annex #31.
+// http://unicode.org/reports/tr31/
+
+namespace {
+
+enum class OtherID { oidNone, oidStart, oidContinue }; // Outcome of OtherIDOfCharacter; oidNone means the character is in neither list
+
+// Some characters are treated as valid for identifiers even
+// though most characters from their category are not: report whether character is one of them.
+// Values copied from http://www.unicode.org/Public/9.0.0/ucd/PropList.txt
+OtherID OtherIDOfCharacter(int character) {
+ if ( // Explicitly listed characters reported as oidStart
+ (character == 0x1885) || // MONGOLIAN LETTER ALI GALI BALUDA
+ (character == 0x1886) || // MONGOLIAN LETTER ALI GALI THREE BALUDA
+ (character == 0x2118) || // SCRIPT CAPITAL P
+ (character == 0x212E) || // ESTIMATED SYMBOL
+ (character == 0x309B) || // KATAKANA-HIRAGANA VOICED SOUND MARK
+ (character == 0x309C)) { // KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK
+ return OtherID::oidStart;
+ } else if ( // Explicitly listed characters reported as oidContinue
+ (character == 0x00B7) || // MIDDLE DOT
+ (character == 0x0387) || // GREEK ANO TELEIA
+ ((character >= 0x1369) && (character <= 0x1371)) || // ETHIOPIC DIGIT ONE..ETHIOPIC DIGIT NINE
+ (character == 0x19DA)) { // NEW TAI LUE THAM DIGIT ONE
+ return OtherID::oidContinue;
+ } else {
+ return OtherID::oidNone; // Not listed in either group
+ }
+}
+
+// Determine if a character is in Ll|Lu|Lt|Lm|Lo|Nl|Mn|Mc|Nd|Pc and also has
+// Pattern_Syntax|Pattern_White_Space; IsIdStart and IsIdContinue reject such characters.
+// As of Unicode 9, only VERTICAL TILDE, which is in Lm and has Pattern_Syntax, matches.
+// TODO: generate a list of Pattern_Syntax and Pattern_White_Space characters from PropList.txt.
+bool IsIdPattern(int character) {
+ return character == 0x2E2F; // VERTICAL TILDE
+}
+
+bool OmitXidStart(int character) { // Characters that IsXidStart rejects without consulting IsIdStart
+ switch (character) {
+ case 0x037A: // GREEK YPOGEGRAMMENI
+ case 0x0E33: // THAI CHARACTER SARA AM
+ case 0x0EB3: // LAO VOWEL SIGN AM
+ case 0x309B: // KATAKANA-HIRAGANA VOICED SOUND MARK
+ case 0x309C: // KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK
+ case 0xFC5E: // ARABIC LIGATURE SHADDA WITH DAMMATAN ISOLATED FORM
+ case 0xFC5F: // ARABIC LIGATURE SHADDA WITH KASRATAN ISOLATED FORM
+ case 0xFC60: // ARABIC LIGATURE SHADDA WITH FATHA ISOLATED FORM
+ case 0xFC61: // ARABIC LIGATURE SHADDA WITH DAMMA ISOLATED FORM
+ case 0xFC62: // ARABIC LIGATURE SHADDA WITH KASRA ISOLATED FORM
+ case 0xFC63: // ARABIC LIGATURE SHADDA WITH SUPERSCRIPT ALEF ISOLATED FORM
+ case 0xFDFA: // ARABIC LIGATURE SALLALLAHOU ALAYHE WASALLAM
+ case 0xFDFB: // ARABIC LIGATURE JALLAJALALOUHOU
+ case 0xFE70: // ARABIC FATHATAN ISOLATED FORM
+ case 0xFE72: // ARABIC DAMMATAN ISOLATED FORM
+ case 0xFE74: // ARABIC KASRATAN ISOLATED FORM
+ case 0xFE76: // ARABIC FATHA ISOLATED FORM
+ case 0xFE78: // ARABIC DAMMA ISOLATED FORM
+ case 0xFE7A: // ARABIC KASRA ISOLATED FORM
+ case 0xFE7C: // ARABIC SHADDA ISOLATED FORM
+ case 0xFE7E: // ARABIC SUKUN ISOLATED FORM
+ case 0xFF9E: // HALFWIDTH KATAKANA VOICED SOUND MARK
+ case 0xFF9F: // HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK
+ return true; // Listed above: omit
+ default:
+ return false; // Not listed
+ }
+}
+
+bool OmitXidContinue(int character) { // Characters that IsXidContinue rejects without consulting IsIdContinue; the OmitXidStart list minus 0x0E33, 0x0EB3, 0xFF9E and 0xFF9F
+ switch (character) {
+ case 0x037A: // GREEK YPOGEGRAMMENI
+ case 0x309B: // KATAKANA-HIRAGANA VOICED SOUND MARK
+ case 0x309C: // KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK
+ case 0xFC5E: // ARABIC LIGATURE SHADDA WITH DAMMATAN ISOLATED FORM
+ case 0xFC5F: // ARABIC LIGATURE SHADDA WITH KASRATAN ISOLATED FORM
+ case 0xFC60: // ARABIC LIGATURE SHADDA WITH FATHA ISOLATED FORM
+ case 0xFC61: // ARABIC LIGATURE SHADDA WITH DAMMA ISOLATED FORM
+ case 0xFC62: // ARABIC LIGATURE SHADDA WITH KASRA ISOLATED FORM
+ case 0xFC63: // ARABIC LIGATURE SHADDA WITH SUPERSCRIPT ALEF ISOLATED FORM
+ case 0xFDFA: // ARABIC LIGATURE SALLALLAHOU ALAYHE WASALLAM
+ case 0xFDFB: // ARABIC LIGATURE JALLAJALALOUHOU
+ case 0xFE70: // ARABIC FATHATAN ISOLATED FORM
+ case 0xFE72: // ARABIC DAMMATAN ISOLATED FORM
+ case 0xFE74: // ARABIC KASRATAN ISOLATED FORM
+ case 0xFE76: // ARABIC FATHA ISOLATED FORM
+ case 0xFE78: // ARABIC DAMMA ISOLATED FORM
+ case 0xFE7A: // ARABIC KASRA ISOLATED FORM
+ case 0xFE7C: // ARABIC SHADDA ISOLATED FORM
+ case 0xFE7E: // ARABIC SUKUN ISOLATED FORM
+ return true; // Listed above: omit
+ default:
+ return false; // Not listed
+ }
+}
+
+}
+
+// Return whether character is in ID_Start, which UAX #31 defines as
+// [[:L:][:Nl:][:Other_ID_Start:]--[:Pattern_Syntax:]--[:Pattern_White_Space:]]
+bool IsIdStart(int character) {
+ if (IsIdPattern(character)) { // Subtracted pattern characters are rejected first
+ return false;
+ }
+ OtherID oid = OtherIDOfCharacter(character);
+ if (oid == OtherID::oidStart) { // Explicitly listed start characters are accepted regardless of category
+ return true;
+ }
+ CharacterCategory c = CategoriseCharacter(character);
+ return (c == ccLl || c == ccLu || c == ccLt || c == ccLm || c == ccLo // Letter categories
+ || c == ccNl); // plus Nl
+}
+
+// Return whether character is in ID_Continue, which UAX #31 defines as
+// [[:ID_Start:][:Mn:][:Mc:][:Nd:][:Pc:][:Other_ID_Continue:]--[:Pattern_Syntax:]--[:Pattern_White_Space:]]
+bool IsIdContinue(int character) {
+ if (IsIdPattern(character)) { // Subtracted pattern characters are rejected first
+ return false;
+ }
+ OtherID oid = OtherIDOfCharacter(character);
+ if (oid != OtherID::oidNone) { // Both oidStart and oidContinue characters are accepted here
+ return true;
+ }
+ CharacterCategory c = CategoriseCharacter(character);
+ return (c == ccLl || c == ccLu || c == ccLt || c == ccLm || c == ccLo // Same categories as IsIdStart
+ || c == ccNl || c == ccMn || c == ccMc || c == ccNd || c == ccPc); // plus Mn, Mc, Nd and Pc
+}
+
+// XID_Start is ID_Start modified for Normalization Form KC in UAX #31
+bool IsXidStart(int character) {
+ if (OmitXidStart(character)) { // Exclusion list is checked first
+ return false;
+ } else {
+ return IsIdStart(character); // Otherwise the same answer as IsIdStart
+ }
+}
+
+// XID_Continue is ID_Continue modified for Normalization Form KC in UAX #31
+bool IsXidContinue(int character) {
+ if (OmitXidContinue(character)) { // Exclusion list is checked first
+ return false;
+ } else {
+ return IsIdContinue(character); // Otherwise the same answer as IsIdContinue
+ }
+}
+
#ifdef SCI_NAMESPACE
}
#endif
diff --git a/lexlib/CharacterCategory.h b/lexlib/CharacterCategory.h
index c8600504b..05ea4187b 100644
--- a/lexlib/CharacterCategory.h
+++ b/lexlib/CharacterCategory.h
@@ -24,6 +24,12 @@ enum CharacterCategory {
CharacterCategory CategoriseCharacter(int character);
+// Common definitions of allowable characters in identifiers from UAX #31.
+bool IsIdStart(int character); // true if character is in ID_Start
+bool IsIdContinue(int character); // true if character is in ID_Continue
+bool IsXidStart(int character); // true if character is in XID_Start
+bool IsXidContinue(int character); // true if character is in XID_Continue
+
#ifdef SCI_NAMESPACE
}
#endif