Allow Python 3 unicode identifiers

author: John Ehresman <jpe@wingware.com> 2016-12-22 16:05:30 -0500
committer: John Ehresman <jpe@wingware.com> 2016-12-22 16:05:30 -0500
commit: 46d360afd32fe73f5f5bb2eea1a352f32393c0c0 (patch)
tree: a0c963ae283795044360d4417a9ffa21b47caecc
parent: 451f73754a58e0c03f592c86ed2b39babe0ff60c (diff)
download: scintilla-mirror-46d360afd32fe73f5f5bb2eea1a352f32393c0c0.tar.gz
1 files changed, 34 insertions, 8 deletions
diff --git a/lexers/LexPython.cxx b/lexers/LexPython.cxx
index 3b8ce1292..a5a470347 100644
--- a/lexers/LexPython.cxx
+++ b/lexers/LexPython.cxx
@@ -25,6 +25,7 @@
 #include "Accessor.h"
 #include "StyleContext.h"
 #include "CharacterSet.h"
+#include "CharacterCategory.h"
 #include "LexerModule.h"
 #include "OptionSet.h"
 #include "SubStyles.h"
@@ -110,12 +111,32 @@ int GetPyStringState(Accessor &styler, Sci_Position i, Sci_PositionU *nextIndex,
 	}
 }
 
-inline bool IsAWordChar(int ch) {
-	return (ch < 0x80) && (isalnum(ch) || ch == '.' || ch == '_');
+inline bool IsAWordChar(int ch, bool unicodeIdentifiers) { // true if ch counts as a word character
+	if (ch < 0x80) // below 0x80: decided by isalnum plus '.' and '_', regardless of the option
+		return (isalnum(ch) || ch == '.' || ch == '_');
+
+	if (!unicodeIdentifiers) // option off: nothing at or above 0x80 is a word character
+		return false;
+
+	// Approximation: Python uses the XID_Continue set from the Unicode data (see
+	// http://unicode.org/reports/tr31/); here only the CategoriseCharacter result is tested.
+	CharacterCategory c = CategoriseCharacter(ch);
+	return (c == ccLl || c == ccLu || c == ccLt || c == ccLm || c == ccLo
+		|| c == ccNl || c == ccMn || c == ccMc || c == ccNd || c == ccPc);
+}
 
-inline bool IsAWordStart(int ch) {
-	return (ch < 0x80) && (isalnum(ch) || ch == '_');
+inline bool IsAWordStart(int ch, bool unicodeIdentifiers) { // true if ch may begin a word
+	if (ch < 0x80) // below 0x80: only isalpha characters and '_' qualify (digits do not)
+		return (isalpha(ch) || ch == '_');
+
+	if (!unicodeIdentifiers) // option off: nothing at or above 0x80 starts a word
+		return false;
+	
+	// Approximation: Python uses the XID_Start set from the Unicode data (see
+	// http://unicode.org/reports/tr31/); here only the CategoriseCharacter result is tested.
+	CharacterCategory c = CategoriseCharacter(ch);
+	return (c == ccLl || c == ccLu || c == ccLt || c == ccLm || c == ccLo
+		|| c == ccNl);
+}
 
 static bool IsFirstNonWhitespace(Sci_Position pos, Accessor &styler) {
@@ -141,6 +162,7 @@ struct OptionsPython {
 	bool fold;
 	bool foldQuotes;
 	bool foldCompact;
+	bool unicodeIdentifiers;
 
 	OptionsPython() {
 		whingeLevel = 0;
@@ -153,6 +175,7 @@ struct OptionsPython {
 		fold = false;
 		foldQuotes = false;
 		foldCompact = false;
+		unicodeIdentifiers = true;
 	}
 
 	literalsAllowed AllowedLiterals() const {
@@ -208,6 +231,9 @@ struct OptionSetPython : public OptionSet<OptionsPython> {
 
 		DefineProperty("fold.compact", &OptionsPython::foldCompact);
 
+		DefineProperty("lexer.python.unicode.identifiers", &OptionsPython::unicodeIdentifiers,
+			"Set to 0 to not recognise Python 3 unicode identifiers.");
+
 		DefineWordListSets(pythonWordListDesc);
 	}
 };
@@ -418,12 +444,12 @@ void SCI_METHOD LexerPython::Lex(Sci_PositionU startPos, Sci_Position length, in
 			kwLast = kwOther;
 			sc.SetState(SCE_P_DEFAULT);
 		} else if (sc.state == SCE_P_NUMBER) {
-			if (!IsAWordChar(sc.ch) &&
+			if (!IsAWordChar(sc.ch, false) &&
 			        !(!base_n_number && ((sc.ch == '+' || sc.ch == '-') && (sc.chPrev == 'e' || sc.chPrev == 'E')))) {
 				sc.SetState(SCE_P_DEFAULT);
 			}
 		} else if (sc.state == SCE_P_IDENTIFIER) {
-			if ((sc.ch == '.') || (!IsAWordChar(sc.ch))) {
+			if ((sc.ch == '.') || (!IsAWordChar(sc.ch, options.unicodeIdentifiers))) {
 				char s[100];
 				sc.GetCurrent(s, sizeof(s));
 				int style = SCE_P_IDENTIFIER;
@@ -495,7 +521,7 @@ void SCI_METHOD LexerPython::Lex(Sci_PositionU startPos, Sci_Position length, in
 				sc.SetState(SCE_P_DEFAULT);
 			}
 		} else if (sc.state == SCE_P_DECORATOR) {
-			if (!IsAWordChar(sc.ch)) {
+			if (!IsAWordChar(sc.ch, options.unicodeIdentifiers)) {
 				sc.SetState(SCE_P_DEFAULT);
 			}
 		} else if ((sc.state == SCE_P_STRING) || (sc.state == SCE_P_CHARACTER)) {
@@ -589,7 +615,7 @@ void SCI_METHOD LexerPython::Lex(Sci_PositionU startPos, Sci_Position length, in
 				while (nextIndex > (sc.currentPos + 1) && sc.More()) {
 					sc.Forward();
 				}
-			} else if (IsAWordStart(sc.ch)) {
+			} else if (IsAWordStart(sc.ch, options.unicodeIdentifiers)) {
 				sc.SetState(SCE_P_IDENTIFIER);
 			}
 		}
author	John Ehresman <jpe@wingware.com>	2016-12-22 16:05:30 -0500
committer	John Ehresman <jpe@wingware.com>	2016-12-22 16:05:30 -0500
commit	46d360afd32fe73f5f5bb2eea1a352f32393c0c0 (patch)
tree	a0c963ae283795044360d4417a9ffa21b47caecc
parent	451f73754a58e0c03f592c86ed2b39babe0ff60c (diff)
download	scintilla-mirror-46d360afd32fe73f5f5bb2eea1a352f32393c0c0.tar.gz