aboutsummaryrefslogtreecommitdiffhomepage
path: root/lexlib/CharacterCategory.cxx
diff options
context:
space:
mode:
Diffstat (limited to 'lexlib/CharacterCategory.cxx')
-rw-r--r--lexlib/CharacterCategory.cxx146
1 files changed, 146 insertions, 0 deletions
diff --git a/lexlib/CharacterCategory.cxx b/lexlib/CharacterCategory.cxx
index 7ca3a0f27..0880171e8 100644
--- a/lexlib/CharacterCategory.cxx
+++ b/lexlib/CharacterCategory.cxx
@@ -3704,6 +3704,152 @@ CharacterCategory CategoriseCharacter(int character) {
return static_cast<CharacterCategory>(*(placeAfter-1) & maskCategory);
}
+// Implementation of character sets recommended for identifiers in Unicode Standard Annex #31.
+// http://unicode.org/reports/tr31/
+
+namespace {
+
+// Result of looking a character up in the Other_ID_Start and
+// Other_ID_Continue exception lists.
+enum class OtherID {
+    oidNone,
+    oidStart,
+    oidContinue
+};
+
+// Some characters are treated as valid for identifiers even
+// though most characters from their category are not.
+// Returns oidStart or oidContinue for a listed character and oidNone for
+// every other value.
+// Values copied from http://www.unicode.org/Public/9.0.0/ucd/PropList.txt
+OtherID OtherIDOfCharacter(int character) {
+    switch (character) {
+    case 0x1885: // MONGOLIAN LETTER ALI GALI BALUDA
+    case 0x1886: // MONGOLIAN LETTER ALI GALI THREE BALUDA
+    case 0x2118: // SCRIPT CAPITAL P
+    case 0x212E: // ESTIMATED SYMBOL
+    case 0x309B: // KATAKANA-HIRAGANA VOICED SOUND MARK
+    case 0x309C: // KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK
+        return OtherID::oidStart;
+
+    case 0x00B7: // MIDDLE DOT
+    case 0x0387: // GREEK ANO TELEIA
+    case 0x1369: // ETHIOPIC DIGIT ONE
+    case 0x136A:
+    case 0x136B:
+    case 0x136C:
+    case 0x136D:
+    case 0x136E:
+    case 0x136F:
+    case 0x1370:
+    case 0x1371: // ETHIOPIC DIGIT NINE
+    case 0x19DA: // NEW TAI LUE THAM DIGIT ONE
+        return OtherID::oidContinue;
+
+    default:
+        return OtherID::oidNone;
+    }
+}
+
+// Determine if a character is in Ll|Lu|Lt|Lm|Lo|Nl|Mn|Mc|Nd|Pc and has
+// Pattern_Syntax|Pattern_White_Space.
+// As of Unicode 9, only VERTICAL TILDE which is in Lm and has Pattern_Syntax matches.
+// Should really generate from PropList.txt a list of Pattern_Syntax and Pattern_White_Space.
+bool IsIdPattern(int character) {
+    constexpr int verticalTilde = 0x2E2F;
+    return character == verticalTilde;
+}
+
+// Characters that both OmitXidStart and OmitXidContinue report as omitted.
+bool OmitXidShared(int character) {
+    switch (character) {
+    case 0x037A: // GREEK YPOGEGRAMMENI
+    case 0x309B: // KATAKANA-HIRAGANA VOICED SOUND MARK
+    case 0x309C: // KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK
+    case 0xFDFA: // ARABIC LIGATURE SALLALLAHOU ALAYHE WASALLAM
+    case 0xFDFB: // ARABIC LIGATURE JALLAJALALOUHOU
+        return true;
+    default:
+        break;
+    }
+    // ARABIC LIGATURE SHADDA WITH DAMMATAN ISOLATED FORM ..
+    // ARABIC LIGATURE SHADDA WITH SUPERSCRIPT ALEF ISOLATED FORM
+    if (character >= 0xFC5E && character <= 0xFC63) {
+        return true;
+    }
+    // ARABIC FATHATAN ISOLATED FORM .. ARABIC SUKUN ISOLATED FORM.
+    // Only the even code points FE70, FE72, ..., FE7E are listed.
+    return character >= 0xFE70 && character <= 0xFE7E && (character % 2) == 0;
+}
+
+// Characters for which IsXidStart answers false without consulting IsIdStart.
+bool OmitXidStart(int character) {
+    const bool startOnly =
+        character == 0x0E33 || // THAI CHARACTER SARA AM
+        character == 0x0EB3 || // LAO VOWEL SIGN AM
+        character == 0xFF9E || // HALFWIDTH KATAKANA VOICED SOUND MARK
+        character == 0xFF9F;   // HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK
+    return startOnly || OmitXidShared(character);
+}
+
+// Characters for which IsXidContinue answers false without consulting IsIdContinue.
+bool OmitXidContinue(int character) {
+    return OmitXidShared(character);
+}
+
+}
+
+// UAX #31 defines ID_Start as
+// [[:L:][:Nl:][:Other_ID_Start:]--[:Pattern_Syntax:]--[:Pattern_White_Space:]]
+// Pattern characters are rejected first, then the Other_ID_Start exceptions
+// are accepted, and any remaining character is decided by its category.
+bool IsIdStart(int character) {
+    if (IsIdPattern(character)) {
+        return false;
+    }
+    if (OtherIDOfCharacter(character) == OtherID::oidStart) {
+        return true;
+    }
+    const CharacterCategory category = CategoriseCharacter(character);
+    // [:L:] is the union of the five letter categories.
+    const bool isLetter = category == ccLl || category == ccLu || category == ccLt
+        || category == ccLm || category == ccLo;
+    return isLetter || category == ccNl;
+}
+
+// UAX #31 defines ID_Continue as
+// [[:ID_Start:][:Mn:][:Mc:][:Nd:][:Pc:][:Other_ID_Continue:]--[:Pattern_Syntax:]--[:Pattern_White_Space:]]
+// Pattern characters are rejected first, then any Other_ID_Start or
+// Other_ID_Continue exception is accepted, and any remaining character is
+// decided by its category.
+bool IsIdContinue(int character) {
+    if (IsIdPattern(character)) {
+        return false;
+    }
+    if (OtherIDOfCharacter(character) != OtherID::oidNone) {
+        return true;
+    }
+    const CharacterCategory category = CategoriseCharacter(character);
+    // Categories that are also accepted at the start of an identifier.
+    const bool startCategory = category == ccLl || category == ccLu || category == ccLt
+        || category == ccLm || category == ccLo || category == ccNl;
+    // Categories that are accepted only after the first character.
+    const bool continueCategory = category == ccMn || category == ccMc
+        || category == ccNd || category == ccPc;
+    return startCategory || continueCategory;
+}
+
+// XID_Start is ID_Start modified for Normalization Form KC in UAX #31
+// Characters on the OmitXidStart list are rejected before IsIdStart is consulted.
+bool IsXidStart(int character) {
+    return !OmitXidStart(character) && IsIdStart(character);
+}
+
+// XID_Continue is ID_Continue modified for Normalization Form KC in UAX #31
+// Characters on the OmitXidContinue list are rejected before IsIdContinue is consulted.
+bool IsXidContinue(int character) {
+    return !OmitXidContinue(character) && IsIdContinue(character);
+}
+
#ifdef SCI_NAMESPACE
}
#endif