diff options
| author | nyamatongwe <unknown> | 2012-05-09 10:20:36 +1000 | 
|---|---|---|
| committer | nyamatongwe <unknown> | 2012-05-09 10:20:36 +1000 | 
| commit | 2a96fb57e3c94999c11c3b0634e05a63b4267948 (patch) | |
| tree | bf72d4ad5d7ea0647e5c3f05de650ab51ca00ca2 /lexers/LexOScript.cxx | |
| parent | c69335b78e70e68112637044ddaa3e3213a20576 (diff) | |
| download | scintilla-mirror-2a96fb57e3c94999c11c3b0634e05a63b4267948.tar.gz | |
OScript lexer by Ferdinand Prantl added. Feature #3523018.
Diffstat (limited to 'lexers/LexOScript.cxx')
| -rw-r--r-- | lexers/LexOScript.cxx | 548 | 
1 files changed, 548 insertions, 0 deletions
| diff --git a/lexers/LexOScript.cxx b/lexers/LexOScript.cxx new file mode 100644 index 000000000..9daff34d5 --- /dev/null +++ b/lexers/LexOScript.cxx @@ -0,0 +1,548 @@ +// Scintilla source code edit control +/** @file LexOScript.cxx + ** Lexer for OScript sources; ocx files and/or OSpace dumps. + ** OScript is a programming language used to develop applications for the + ** Livelink server platform. + **/ +// Written by Ferdinand Prantl <prantlf@gmail.com>, inspired by the code from +// LexVB.cxx and LexPascal.cxx. The License.txt file describes the conditions +// under which this software may be distributed. + +#include <stdlib.h> +#include <string.h> +#include <stdio.h> +#include <stdarg.h> +#include <assert.h> +#include <ctype.h> + +#include "ILexer.h" +#include "Scintilla.h" +#include "SciLexer.h" + +#include "WordList.h" +#include "LexAccessor.h" +#include "Accessor.h" +#include "StyleContext.h" +#include "CharacterSet.h" +#include "LexerModule.h" + +#ifdef SCI_NAMESPACE +using namespace Scintilla; +#endif + +// ----------------------------------------- +// Functions classifying a single character. + +// This function is generic and should be probably moved to CharSet.h where +// IsAlphaNumeric the others reside. +inline bool IsAlpha(int ch) { +	return (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z'); +} + +static inline bool IsIdentifierChar(int ch) { +	// Identifiers cannot contain non-ASCII letters; a word with non-English +	// language-specific characters cannot be an identifier. +	return IsAlphaNumeric(ch) || ch == '_'; +} + +static inline bool IsIdentifierStart(int ch) { +	// Identifiers cannot contain non-ASCII letters; a word with non-English +	// language-specific characters cannot be an identifier. +	return IsAlpha(ch) || ch == '_'; +} + +static inline bool IsNumberChar(int ch, int chNext) { +	// Numeric constructs are not checked for lexical correctness. They are +	// expected to look like +1.23-E9 but actually any bunch of the following +	// characters will be styled as number. +	// KNOWN PROBLEM: if you put + or - operators immediately after a number +	// and the next operand starts with the letter E, the operator will not be +	// recognized and it will be styled together with the preceding number. +	// This should not occur; at least not often. The coding style recommends +	// putting spaces around operators. +	return IsADigit(ch) || toupper(ch) == 'E' || ch == '.' || +		   ((ch == '-' || ch == '+') && toupper(chNext) == 'E'); +} + +// This function checks for the start or a natural number without any symbols +// or operators as a prefix; the IsPrefixedNumberStart should be called +// immediately after this one to cover all possible numeric constructs. +static inline bool IsNaturalNumberStart(int ch) { +	return IsADigit(ch) != 0; +} + +static inline bool IsPrefixedNumberStart(int ch, int chNext) { +	// KNOWN PROBLEM: if you put + or - operators immediately before a number +	// the operator will not be recognized and it will be styled together with +	// the succeeding number. This should not occur; at least not often. The +	// coding style recommends putting spaces around operators. +	return (ch == '.' || ch == '-' || ch == '+') && IsADigit(chNext); +} + +static inline bool IsOperator(int ch) { +	return strchr("%^&*()-+={}[]:;<>,/?!.~|\\", ch) != NULL; +} + +// --------------------------------------------------------------- +// Functions classifying a token currently processed in the lexer. + +// Checks if the current line starts with the preprocessor directive used +// usually to introduce documentation comments: #ifdef DOC. This method is +// supposed to be called if the line has been recognized as a preprocessor +// directive already. +static bool IsDocCommentStart(StyleContext &sc) { +	// Check the line back to its start only if the end looks promising. +	if (sc.LengthCurrent() == 10 && !IsAlphaNumeric(sc.ch)) { +		char s[11]; +		sc.GetCurrentLowered(s, sizeof(s)); +		return strcmp(s, "#ifdef doc") == 0; +	} +	return false; +} + +// Checks if the current line starts with the preprocessor directive that +// is complementary to the #ifdef DOC start: #endif. This method is supposed +// to be called if the current state point to the documentation comment. +// QUESTIONAL ASSUMPTION: The complete #endif directive is not checked; just +// the starting #e. However, there is no other preprocessor directive with +// the same starting letter and thus this optimization should always work. +static bool IsDocCommentEnd(StyleContext &sc) { +	return sc.ch == '#' && sc.chNext == 'e'; +} + +class IdentifierClassifier { +	WordList &keywords;  // Passed from keywords property. +	WordList &constants; // Passed from keywords2 property. +	WordList &operators; // Passed from keywords3 property. +	WordList &types;     // Passed from keywords4 property. +	WordList &functions; // Passed from keywords5 property. +	WordList &objects;   // Passed from keywords6 property. + +	IdentifierClassifier(IdentifierClassifier const&); +	IdentifierClassifier& operator=(IdentifierClassifier const&); + +public: +	IdentifierClassifier(WordList *keywordlists[]) : +		keywords(*keywordlists[0]), constants(*keywordlists[1]), +		operators(*keywordlists[2]), types(*keywordlists[3]), +		functions(*keywordlists[4]), objects(*keywordlists[5]) +	{} + +	void ClassifyIdentifier(StyleContext &sc) { +		// Opening parenthesis following an identifier makes it a possible +		// function call. +		// KNOWN PROBLEM: If some whitespace is inserted between the +		// identifier and the parenthesis they will not be able to be +		// recognized as a function call. This should not occur; at +		// least not often. Such coding style would be weird. +		if (sc.Match('(')) { +			char s[100]; +			sc.GetCurrentLowered(s, sizeof(s)); +			// Before an opening brace can be control statements and +			// operators too; function call is the last option. +			if (keywords.InList(s)) { +				sc.ChangeState(SCE_OSCRIPT_KEYWORD); +			} else if (operators.InList(s)) { +				sc.ChangeState(SCE_OSCRIPT_OPERATOR); +			} else if (functions.InList(s)) { +				sc.ChangeState(SCE_OSCRIPT_FUNCTION); +			} else { +				sc.ChangeState(SCE_OSCRIPT_METHOD); +			} +			sc.SetState(SCE_OSCRIPT_OPERATOR); +		} else { +			char s[100]; +			sc.GetCurrentLowered(s, sizeof(s)); +			// A dot following an identifier means an access to an object +			// member. The related object identifier can be special. +			// KNOWN PROBLEM: If there is whitespace between the identifier +			// and the following dot, the identifier will not be recognized +			// as an object in an object member access. If it is one of the +			// listed static objects it will not be styled. +			if (sc.Match('.') && objects.InList(s)) { +				sc.ChangeState(SCE_OSCRIPT_OBJECT); +				sc.SetState(SCE_OSCRIPT_OPERATOR); +			} else { +				if (keywords.InList(s)) { +					sc.ChangeState(SCE_OSCRIPT_KEYWORD); +				} else if (constants.InList(s)) { +					sc.ChangeState(SCE_OSCRIPT_CONSTANT); +				} else if (operators.InList(s)) { +					sc.ChangeState(SCE_OSCRIPT_OPERATOR); +				} else if (types.InList(s)) { +					sc.ChangeState(SCE_OSCRIPT_TYPE); +				} else if (functions.InList(s)) { +					sc.ChangeState(SCE_OSCRIPT_FUNCTION); +				} +				sc.SetState(SCE_OSCRIPT_DEFAULT); +			} +		} +	} +}; + +// ------------------------------------------------ +// Function colourising an excerpt of OScript code. + +static void ColouriseOScriptDoc(unsigned int startPos, int length, +								int initStyle, WordList *keywordlists[], +								Accessor &styler) { +	// I wonder how whole-line styles ended by EOLN can escape the resetting +	// code in the loop below and overflow to the next line. Let us make sure +	// that a new line does not start with them carried from the previous one. +	// NOTE: An overflowing string is intentionally not checked; it reminds +	// the developer that the string must be ended on the same line. +	if (initStyle == SCE_OSCRIPT_LINE_COMMENT || +			initStyle == SCE_OSCRIPT_PREPROCESSOR) { +		initStyle = SCE_OSCRIPT_DEFAULT; +	} + +	styler.StartAt(startPos); +	StyleContext sc(startPos, length, initStyle, styler); +	IdentifierClassifier identifierClassifier(keywordlists); + +	// It starts with true at the beginning of a line and changes to false as +	// soon as the first non-whitespace character has been processed. +	bool isFirstToken = true; +	// It starts with true at the beginning of a line and changes to false as +	// soon as the first identifier on the line is passed by. +	bool isFirstIdentifier = true;  +	// It becomes false when #ifdef DOC (the preprocessor directive often +	// used to start a documentation comment) is encountered and remain false +	// until the end of the documentation block is not detected. This is done +	// by checking for the complementary #endif preprocessor directive. +	bool endDocComment = false;  + +	for (; sc.More(); sc.Forward()) { + +		if (sc.atLineStart) { +			isFirstToken = true; +			isFirstIdentifier = true; +		// Detect the current state is neither whitespace nor identifier. It +		// means that no next identifier can be the first token on the line. +		} else if (isFirstIdentifier && sc.state != SCE_OSCRIPT_DEFAULT && +				   sc.state != SCE_OSCRIPT_IDENTIFIER) { +			isFirstIdentifier = false; +		} + +		// Check if the current state should be changed. +		if (sc.state == SCE_OSCRIPT_OPERATOR) { +			// Multiple-symbol operators are marked by single characters. +			sc.SetState(SCE_OSCRIPT_DEFAULT); +		} else if (sc.state == SCE_OSCRIPT_IDENTIFIER) { +			if (!IsIdentifierChar(sc.ch)) { +				// Colon after an identifier makes it a label if it is the +				// first token on the line. +				// KNOWN PROBLEM: If some whitespace is inserted between the +				// identifier and the colon they will not be recognized as a +				// label. This should not occur; at least not often. It would +				// make the code structure less legible and examples in the +				// Livelink documentation do not show it. +				if (sc.Match(':') && isFirstIdentifier) { +					sc.ChangeState(SCE_OSCRIPT_LABEL); +					sc.ForwardSetState(SCE_OSCRIPT_DEFAULT); +				} else { +					identifierClassifier.ClassifyIdentifier(sc); +				} +				// Avoid a sequence of two words be mistaken for a label. A +				// switch case would be an example. +				isFirstIdentifier = false; +			} +		} else if (sc.state == SCE_OSCRIPT_GLOBAL) { +			if (!IsIdentifierChar(sc.ch)) { +				sc.SetState(SCE_OSCRIPT_DEFAULT); +			} +		} else if (sc.state == SCE_OSCRIPT_PROPERTY) { +			if (!IsIdentifierChar(sc.ch)) { +				// Any member access introduced by the dot operator is +				// initially marked as a property access. If an opening +				// parenthesis is detected later it is changed to method call. +				// KNOWN PROBLEM: The same as at the function call recognition +				// for SCE_OSCRIPT_IDENTIFIER above. +				if (sc.Match('(')) { +					sc.ChangeState(SCE_OSCRIPT_METHOD); +				} +				sc.SetState(SCE_OSCRIPT_DEFAULT); +			} +		} else if (sc.state == SCE_OSCRIPT_NUMBER) { +			if (!IsNumberChar(sc.ch, sc.chNext)) { +				sc.SetState(SCE_OSCRIPT_DEFAULT); +			} +		} else if (sc.state == SCE_OSCRIPT_SINGLEQUOTE_STRING) { +			if (sc.ch == '\'') { +				// Two consequential apostrophes convert to a single one. +				if (sc.chNext == '\'') { +					sc.Forward(); +				} else { +					sc.ForwardSetState(SCE_OSCRIPT_DEFAULT); +				} +			} else if (sc.atLineEnd) { +				sc.ForwardSetState(SCE_OSCRIPT_DEFAULT); +			} +		} else if (sc.state == SCE_OSCRIPT_DOUBLEQUOTE_STRING) { +			if (sc.ch == '\"') { +				// Two consequential quotation marks convert to a single one. +				if (sc.chNext == '\"') { +					sc.Forward(); +				} else { +					sc.ForwardSetState(SCE_OSCRIPT_DEFAULT); +				} +			} else if (sc.atLineEnd) { +				sc.ForwardSetState(SCE_OSCRIPT_DEFAULT); +			} +		} else if (sc.state == SCE_OSCRIPT_BLOCK_COMMENT) { +			if (sc.Match('*', '/')) { +				sc.Forward(); +				sc.ForwardSetState(SCE_OSCRIPT_DEFAULT); +			} +		} else if (sc.state == SCE_OSCRIPT_LINE_COMMENT) { +			if (sc.atLineEnd) { +				sc.ForwardSetState(SCE_OSCRIPT_DEFAULT); +			} +		} else if (sc.state == SCE_OSCRIPT_PREPROCESSOR) { +			if (IsDocCommentStart(sc)) { +				sc.ChangeState(SCE_OSCRIPT_DOC_COMMENT); +				endDocComment = false; +			} else if (sc.atLineEnd) { +				sc.ForwardSetState(SCE_OSCRIPT_DEFAULT); +			} +		} else if (sc.state == SCE_OSCRIPT_DOC_COMMENT) { +			// KNOWN PROBLEM: The first line detected that would close a +			// conditional preprocessor block (#endif) the documentation +			// comment block will end. (Nested #if-#endif blocks are not +			// supported. Hopefully it will not occur often that a line +			// within the text block would stat with #endif. +			if (isFirstToken && IsDocCommentEnd(sc)) { +				endDocComment = true; +			} else if (sc.atLineEnd && endDocComment) { +				sc.ForwardSetState(SCE_OSCRIPT_DEFAULT); +			} +		} + +		// Check what state starts with the current character. +		if (sc.state == SCE_OSCRIPT_DEFAULT) { +			if (sc.Match('\'')) { +				sc.SetState(SCE_OSCRIPT_SINGLEQUOTE_STRING); +			} else if (sc.Match('\"')) { +				sc.SetState(SCE_OSCRIPT_DOUBLEQUOTE_STRING); +			} else if (sc.Match('/', '/')) { +				sc.SetState(SCE_OSCRIPT_LINE_COMMENT); +				sc.Forward(); +			} else if (sc.Match('/', '*')) { +				sc.SetState(SCE_OSCRIPT_BLOCK_COMMENT); +				sc.Forward(); +			} else if (isFirstToken && sc.Match('#')) { +				sc.SetState(SCE_OSCRIPT_PREPROCESSOR); +			} else if (sc.Match('$')) { +				// Both process-global ($xxx) and thread-global ($$xxx) +				// variables are handled as one global. +				sc.SetState(SCE_OSCRIPT_GLOBAL); +			} else if (IsNaturalNumberStart(sc.ch)) { +				sc.SetState(SCE_OSCRIPT_NUMBER); +			} else if (IsPrefixedNumberStart(sc.ch, sc.chNext)) { +				sc.SetState(SCE_OSCRIPT_NUMBER); +				sc.Forward(); +			} else if (sc.Match('.') && IsIdentifierStart(sc.chNext)) { +				// Every object member access is marked as a property access +				// initially. The decision between property and method is made +				// after parsing the identifier and looking what comes then. +				// KNOWN PROBLEM: If there is whitespace between the following +				// identifier and the dot, the dot will not be recognized +				// as a member accessing operator. In turn, the identifier +				// will not be recognizable as a property or a method too. +				sc.SetState(SCE_OSCRIPT_OPERATOR); +				sc.Forward(); +				sc.SetState(SCE_OSCRIPT_PROPERTY); +			} else if (IsIdentifierStart(sc.ch)) { +				sc.SetState(SCE_OSCRIPT_IDENTIFIER); +			} else if (IsOperator(sc.ch)) { +				sc.SetState(SCE_OSCRIPT_OPERATOR); +			} +		} + +		if (isFirstToken && !IsASpaceOrTab(sc.ch)) { +			isFirstToken = false; +		} +	} + +	sc.Complete(); +} + +// ------------------------------------------ +// Functions supporting OScript code folding. + +static inline bool IsBlockComment(int style) { +	return style == SCE_OSCRIPT_BLOCK_COMMENT; +} + +static bool IsLineComment(int line, Accessor &styler) { +	int pos = styler.LineStart(line); +	int eolPos = styler.LineStart(line + 1) - 1; +	for (int i = pos; i < eolPos; i++) { +		char ch = styler[i]; +		char chNext = styler.SafeGetCharAt(i + 1); +		int style = styler.StyleAt(i); +		if (ch == '/' && chNext == '/' && style == SCE_OSCRIPT_LINE_COMMENT) { +			return true; +		} else if (!IsASpaceOrTab(ch)) { +			return false; +		} +	} +	return false; +} + +static inline bool IsPreprocessor(int style) { +	return style == SCE_OSCRIPT_PREPROCESSOR || +		   style == SCE_OSCRIPT_DOC_COMMENT; +} + +static void GetRangeLowered(unsigned int start, unsigned int end, +							Accessor &styler, char *s, unsigned int len) { +	unsigned int i = 0; +	while (i < end - start + 1 && i < len - 1) { +		s[i] = static_cast<char>(tolower(styler[start + i])); +		i++; +	} +	s[i] = '\0'; +} + +static void GetForwardWordLowered(unsigned int start, Accessor &styler, +								  char *s, unsigned int len) { +	unsigned int i = 0; +	while (i < len - 1 && IsAlpha(styler.SafeGetCharAt(start + i))) { +		s[i] = static_cast<char>(tolower(styler.SafeGetCharAt(start + i))); +		i++; +	} +	s[i] = '\0'; +} + +static void UpdatePreprocessorFoldLevel(int &levelCurrent, +		unsigned int startPos, Accessor &styler) { +	char s[7]; // Size of the longest possible keyword + null. +	GetForwardWordLowered(startPos, styler, s, sizeof(s)); + +	if (strcmp(s, "ifdef") == 0 || +		strcmp(s, "ifndef") == 0) { +		levelCurrent++; +	} else if (strcmp(s, "endif") == 0) { +		levelCurrent--; +		if (levelCurrent < SC_FOLDLEVELBASE) { +			levelCurrent = SC_FOLDLEVELBASE; +		} +	} +} + +static void UpdateKeywordFoldLevel(int &levelCurrent, unsigned int lastStart, +		unsigned int currentPos, Accessor &styler) { +	char s[9]; +	GetRangeLowered(lastStart, currentPos, styler, s, sizeof(s)); + +	if (strcmp(s, "if") == 0 || strcmp(s, "for") == 0 || +		strcmp(s, "switch") == 0 || strcmp(s, "function") == 0 || +		strcmp(s, "while") == 0 || strcmp(s, "repeat") == 0) { +		levelCurrent++; +	} else if (strcmp(s, "end") == 0 || strcmp(s, "until") == 0) { +		levelCurrent--; +		if (levelCurrent < SC_FOLDLEVELBASE) { +			levelCurrent = SC_FOLDLEVELBASE; +		} +	} +} + +// ------------------------------ +// Function folding OScript code. + +static void FoldOScriptDoc(unsigned int startPos, int length, int initStyle, +						   WordList *[], Accessor &styler) { +	bool foldComment = styler.GetPropertyInt("fold.comment") != 0; +	bool foldPreprocessor = styler.GetPropertyInt("fold.preprocessor") != 0; +	bool foldCompact = styler.GetPropertyInt("fold.compact", 1) != 0; +	int endPos = startPos + length; +	int visibleChars = 0; +	int lineCurrent = styler.GetLine(startPos); +	int levelPrev = styler.LevelAt(lineCurrent) & SC_FOLDLEVELNUMBERMASK; +	int levelCurrent = levelPrev; +	char chNext = styler[startPos]; +	int styleNext = styler.StyleAt(startPos); +	int style = initStyle; +	int lastStart = 0; + +	for (int i = startPos; i < endPos; i++) { +		char ch = chNext; +		chNext = styler.SafeGetCharAt(i + 1); +		int stylePrev = style; +		style = styleNext; +		styleNext = styler.StyleAt(i + 1); +		bool atLineEnd = (ch == '\r' && chNext != '\n') || (ch == '\n'); + +		if (foldComment && IsBlockComment(style)) { +			if (!IsBlockComment(stylePrev)) { +				levelCurrent++; +			} else if (!IsBlockComment(styleNext) && !atLineEnd) { +				// Comments do not end at end of line and the next character +				// may not be styled. +				levelCurrent--; +			} +		} +		if (foldComment && atLineEnd && IsLineComment(lineCurrent, styler)) { +			if (!IsLineComment(lineCurrent - 1, styler) && +				IsLineComment(lineCurrent + 1, styler)) +				levelCurrent++; +			else if (IsLineComment(lineCurrent - 1, styler) && +					 !IsLineComment(lineCurrent+1, styler)) +				levelCurrent--; +		} +		if (foldPreprocessor) { +			if (ch == '#' && IsPreprocessor(style)) { +				UpdatePreprocessorFoldLevel(levelCurrent, i + 1, styler); +			} +		} + +		if (stylePrev != SCE_OSCRIPT_KEYWORD && style == SCE_OSCRIPT_KEYWORD) { +			lastStart = i; +		} +		if (stylePrev == SCE_OSCRIPT_KEYWORD) { +			if(IsIdentifierChar(ch) && !IsIdentifierChar(chNext)) { +				UpdateKeywordFoldLevel(levelCurrent, lastStart, i, styler); +			} +		} + +		if (!IsASpace(ch)) +			visibleChars++; + +		if (atLineEnd) { +			int level = levelPrev; +			if (visibleChars == 0 && foldCompact) +				level |= SC_FOLDLEVELWHITEFLAG; +			if ((levelCurrent > levelPrev) && (visibleChars > 0)) +				level |= SC_FOLDLEVELHEADERFLAG; +			if (level != styler.LevelAt(lineCurrent)) { +				styler.SetLevel(lineCurrent, level); +			} +			lineCurrent++; +			levelPrev = levelCurrent; +			visibleChars = 0; +		} +	} + +	// If we did not reach EOLN in the previous loop, store the line level and +	// whitespace information. The rest will be filled in later. +	int lev = levelPrev; +	if (visibleChars == 0 && foldCompact) +		lev |= SC_FOLDLEVELWHITEFLAG; +	styler.SetLevel(lineCurrent, lev); +} + +// -------------------------------------------- +// Declaration of the OScript lexer descriptor. + +static const char * const oscriptWordListDesc[] = { +	"Keywords and reserved words", +	"Literal constants", +	"Literal operators", +	"Built-in value and reference types", +	"Built-in global functions", +	"Built-in static objects", +	0 +}; + +LexerModule lmOScript(SCLEX_OSCRIPT, ColouriseOScriptDoc, "oscript", FoldOScriptDoc, oscriptWordListDesc); | 
