diff options
author | mitchell <unknown> | 2018-03-26 09:07:42 -0400 |
---|---|---|
committer | mitchell <unknown> | 2018-03-26 09:07:42 -0400 |
commit | 32112078c485b76ffca26c2261f005b48cb80ace (patch) | |
tree | 10593bdd5e1119d55ec2a5f82aebbd626553b1ad | |
parent | 08de462c79ec32166e02d0661fe939e8d3e1c828 (diff) | |
download | scintilla-mirror-32112078c485b76ffca26c2261f005b48cb80ace.tar.gz |
Removed unwanted files in lexlua/.rel-3-8-0
-rw-r--r-- | lexlua/html2.lua | 147 | ||||
-rw-r--r-- | lexlua/lexer2.lua | 1723 | ||||
-rw-r--r-- | lexlua/mumps.lua | 112 | ||||
-rw-r--r-- | lexlua/ps.lua.orig | 167 |
4 files changed, 0 insertions, 2149 deletions
diff --git a/lexlua/html2.lua b/lexlua/html2.lua deleted file mode 100644 index ad1bd9c87..000000000 --- a/lexlua/html2.lua +++ /dev/null @@ -1,147 +0,0 @@ --- Copyright 2006-2018 Mitchell mitchell.att.foicica.com. See License.txt. --- HTML LPeg lexer. - -local l = require('lexer') -local token, word_match = l.token, l.word_match -local P, R, S, V = lpeg.P, lpeg.R, lpeg.S, lpeg.V - -local lexer = l.new('html') - --- Whitespace. -local ws = token(l.WHITESPACE, l.space^1) -lexer:add_rule('whitespace', ws) - --- Comments. -lexer:add_rule('comment', - token(l.COMMENT, '<!--' * (l.any - '-->')^0 * P('-->')^-1)) - --- Doctype. -lexer:add_rule('doctype', token('doctype', '<!' * word_match('doctype', true) * - (l.any - '>')^1 * '>')) -lexer:add_style('doctype', l.STYLE_COMMENT) - --- Elements. -local known_element = token('element', '<' * P('/')^-1 * word_match([[ - a abbr address area article aside audio b base bdi bdo blockquote body - br button canvas caption cite code col colgroup content data datalist dd - decorator del details dfn div dl dt element em embed fieldset figcaption - figure footer form h1 h2 h3 h4 h5 h6 head header hr html i iframe img input - ins kbd keygen label legend li link main map mark menu menuitem meta meter - nav noscript object ol optgroup option output p param pre progress q rp rt - ruby s samp script section select shadow small source spacer span strong - style sub summary sup table tbody td template textarea tfoot th thead time - title tr track u ul var video wbr -]], true)) -lexer:add_style('element', l.STYLE_KEYWORD) -local unknown_element = token('unknown_element', '<' * P('/')^-1 * l.word) -lexer:add_style('unknown_element', l.STYLE_KEYWORD..',italics') -local element = known_element + unknown_element -lexer:add_rule('element', element) - --- Closing tags. -local tag_close = token('element', P('/')^-1 * '>') -lexer:add_rule('tag_close', tag_close) - --- Attributes. 
-local known_attribute = token('attribute', word_match([[ - accept accept-charset accesskey action align alt async autocomplete autofocus - autoplay bgcolor border buffered challenge charset checked cite class code - codebase color cols colspan content contenteditable contextmenu controls - coords data data- datetime default defer dir dirname disabled download - draggable dropzone enctype for form headers height hidden high href hreflang - http-equiv icon id ismap itemprop keytype kind label lang language list - loop low manifest max maxlength media method min multiple name novalidate - open optimum pattern ping placeholder poster preload pubdate radiogroup - readonly rel required reversed role rows rowspan sandbox scope scoped - seamless selected shape size sizes span spellcheck src srcdoc srclang - start step style summary tabindex target title type usemap value width wrap -]], true) + ((P('data-') + 'aria-') * (l.alnum + '-')^1)) -lexer:add_style('attribute', l.STYLE_TYPE) -local unknown_attribute = token('unknown_attribute', l.word) -lexer:add_style('unknown_attribute', l.STYLE_TYPE..',italics') -local attribute = (known_attribute + unknown_attribute) * #(l.space^0 * '=') -lexer:add_rule('attribute', attribute) - --- TODO: performance is terrible on large files. -local in_tag = P(function(input, index) - local before = input:sub(1, index - 1) - local s, e = before:find('<[^>]-$'), before:find('>[^<]-$') - if s and e then return s > e and index or nil end - if s then return index end - return input:find('^[^<]->', index) and index or nil -end) - --- Equals. -local equals = token(l.OPERATOR, '=') --* in_tag ---lexer:add_rule('equals', equals) - --- Strings. -local sq_str = l.delimited_range("'") -local dq_str = l.delimited_range('"') -local string = #S('\'"') * l.last_char_includes('=') * - token(l.STRING, sq_str + dq_str) -lexer:add_rule('string', string) - --- Numbers. 
-lexer:add_rule('number', #l.digit * l.last_char_includes('=') * - token(l.NUMBER, l.digit^1 * P('%')^-1)) --* in_tag) - --- Entities. -lexer:add_rule('entity', token('entity', '&' * (l.any - l.space - ';')^1 * ';')) -lexer:add_style('entity', l.STYLE_COMMENT) - --- Fold points. -lexer:add_fold_point('element', '<', '</') -lexer:add_fold_point('element', '<', '/>') -lexer:add_fold_point('unknown_element', '<', '</') -lexer:add_fold_point('unknown_element', '<', '/>') -lexer:add_fold_point(l.COMMENT, '<!--', '-->') - --- Tags that start embedded languages. -lexer.embed_start_tag = element * - (ws * attribute * ws^-1 * equals * ws^-1 * string)^0 * - ws^-1 * tag_close -lexer.embed_end_tag = element * tag_close - --- Embedded CSS. -local css = l.load('css') -local style_element = word_match('style', true) -local css_start_rule = #(P('<') * style_element * - ('>' + P(function(input, index) - if input:find('^%s+type%s*=%s*(["\'])text/css%1', index) then - return index - end -end))) * lexer.embed_start_tag -- <style type="text/css"> -local css_end_rule = #('</' * style_element * ws^-1 * '>') * - lexer.embed_end_tag -- </style> -lexer:embed(css, css_start_rule, css_end_rule) - --- Embedded JavaScript. -local js = l.load('javascript') -local script_element = word_match('script', true) -local js_start_rule = #(P('<') * script_element * - ('>' + P(function(input, index) - if input:find('^%s+type%s*=%s*(["\'])text/javascript%1', index) then - return index - end -end))) * lexer.embed_start_tag -- <script type="text/javascript"> -local js_end_rule = #('</' * script_element * ws^-1 * '>') * - lexer.embed_end_tag -- </script> -local js_line_comment = '//' * (l.nonnewline_esc - js_end_rule)^0 -local js_block_comment = '/*' * (l.any - '*/' - js_end_rule)^0 * P('*/')^-1 -js:modify_rule('comment', token(l.COMMENT, js_line_comment + js_block_comment)) -lexer:embed(js, js_start_rule, js_end_rule) - --- Embedded CoffeeScript. 
-local cs = l.load('coffeescript') -local script_element = word_match('script', true) -local cs_start_rule = #(P('<') * script_element * P(function(input, index) - if input:find('^[^>]+type%s*=%s*(["\'])text/coffeescript%1', index) then - return index - end -end)) * lexer.embed_start_tag -- <script type="text/coffeescript"> -local cs_end_rule = #('</' * script_element * ws^-1 * '>') * - lexer.embed_end_tag -- </script> -lexer:embed(cs, cs_start_rule, cs_end_rule) - -return lexer diff --git a/lexlua/lexer2.lua b/lexlua/lexer2.lua deleted file mode 100644 index b32240aab..000000000 --- a/lexlua/lexer2.lua +++ /dev/null @@ -1,1723 +0,0 @@ --- Copyright 2006-2018 Mitchell mitchell.att.foicica.com. See License.txt. - -local M = {} - ---[=[ This comment is for LuaDoc. ---- --- Lexes Scintilla documents and source code with Lua and LPeg. --- --- ## Overview --- --- Lexers highlight the syntax of source code. Scintilla (the editing component --- behind [Textadept][] and [SciTE][]) traditionally uses static, compiled C++ --- lexers which are notoriously difficult to create and/or extend. On the other --- hand, Lua makes it easy to rapidly create new lexers, extend existing --- ones, and embed lexers within one another. Lua lexers tend to be more --- readable than C++ lexers too. --- --- Lexers are Parsing Expression Grammars, or PEGs, composed with the Lua --- [LPeg library][]. The following table comes from the LPeg documentation and --- summarizes all you need to know about constructing basic LPeg patterns. This --- module provides convenience functions for creating and working with other --- more advanced patterns and concepts. --- --- Operator | Description --- ---------------------|------------ --- `lpeg.P(string)` | Matches `string` literally. --- `lpeg.P(`_`n`_`)` | Matches exactly _`n`_ characters. --- `lpeg.S(string)` | Matches any character in set `string`. --- `lpeg.R("`_`xy`_`")` | Matches any character between range `x` and `y`.
--- `patt^`_`n`_ | Matches at least _`n`_ repetitions of `patt`. --- `patt^-`_`n`_ | Matches at most _`n`_ repetitions of `patt`. --- `patt1 * patt2` | Matches `patt1` followed by `patt2`. --- `patt1 + patt2` | Matches `patt1` or `patt2` (ordered choice). --- `patt1 - patt2` | Matches `patt1` if `patt2` does not match. --- `-patt` | Equivalent to `("" - patt)`. --- `#patt` | Matches `patt` but consumes no input. --- --- The first part of this document deals with rapidly constructing a simple --- lexer. The next part deals with more advanced techniques, such as custom --- coloring and embedding lexers within one another. Following that is a --- discussion about code folding, or being able to tell Scintilla which code --- blocks are "foldable" (temporarily hideable from view). After that are --- instructions on how to use LPeg lexers with the aforementioned Textadept and --- SciTE editors. Finally there are comments on lexer performance and --- limitations. --- --- [LPeg library]: http://www.inf.puc-rio.br/~roberto/lpeg/lpeg.html --- [Textadept]: http://foicica.com/textadept --- [SciTE]: http://scintilla.org/SciTE.html --- --- ## Lexer Basics --- --- The *lexers/* directory contains all lexers, including your new one. Before --- attempting to write one from scratch though, first determine if your --- programming language is similar to any of the 100+ languages supported. If --- so, you may be able to copy and modify that lexer, saving some time and --- effort. The filename of your lexer should be the name of your programming --- language in lower case followed by a *.lua* extension. For example, a new Lua --- lexer has the name *lua.lua*. --- --- Note: Try to refrain from using one-character language names like "c", "d", --- or "r". For example, Scintillua uses "ansi_c", "dmd", and "rstats", --- respectively. --- --- ### New Lexer Template --- --- There is a *lexers/template.txt* file that contains a simple template for a --- new lexer. 
Feel free to use it, replacing the '?'s with the name of your --- lexer: --- --- -- ? LPeg lexer. --- --- local l = require('lexer') --- local token, word_match = l.token, l.word_match --- local P, R, S = lpeg.P, lpeg.R, lpeg.S --- --- local lexer = l.new('?') --- --- -- Whitespace. --- local ws = token(l.WHITESPACE, l.space^1) --- lexer:add_rule('whitespace', ws) --- --- return lexer --- --- The first 3 lines of code simply define often used convenience variables. The --- fourth and last lines [define](#lexer.new) and return the lexer object --- Scintilla uses; they are very important and must be part of every lexer. The --- fifth line defines something called a "token", an essential building block of --- lexers. You will learn about tokens shortly. The sixth line defines a lexer --- grammar rule, which you will learn about later, as well as token styles. --- Note, however, the `local` prefix in front of variables, which is needed --- so-as not to affect Lua's global environment. All in all, this is a minimal, --- working lexer that you can build on. --- --- ### Tokens --- --- Take a moment to think about your programming language's structure. What kind --- of key elements does it have? In the template shown earlier, one predefined --- element all languages have is whitespace. Your language probably also has --- elements like comments, strings, and keywords. Lexers refer to these elements --- as "tokens". Tokens are the fundamental "building blocks" of lexers. Lexers --- break down source code into tokens for coloring, which results in the syntax --- highlighting familiar to you. It is up to you how specific your lexer is when --- it comes to tokens. Perhaps only distinguishing between keywords and --- identifiers is necessary, or maybe recognizing constants and built-in --- functions, methods, or libraries is desirable. 
The Lua lexer, for example, --- defines 11 tokens: whitespace, keywords, built-in functions, constants, --- built-in libraries, identifiers, strings, comments, numbers, labels, and --- operators. Even though constants, built-in functions, and built-in libraries --- are subsets of identifiers, Lua programmers find it helpful for the lexer to --- distinguish between them all. It is perfectly acceptable to just recognize --- keywords and identifiers. --- --- In a lexer, tokens consist of a token name and an LPeg pattern that matches a --- sequence of characters recognized as an instance of that token. Create tokens --- using the [`lexer.token()`]() function. Let us examine the "whitespace" token --- defined in the template shown earlier: --- --- local ws = token(l.WHITESPACE, l.space^1) --- --- At first glance, the first argument does not appear to be a string name and --- the second argument does not appear to be an LPeg pattern. Perhaps you --- expected something like: --- --- local ws = token('whitespace', S('\t\v\f\n\r ')^1) --- --- The `lexer` (`l`) module actually provides a convenient list of common token --- names and common LPeg patterns for you to use. Token names include --- [`lexer.DEFAULT`](), [`lexer.WHITESPACE`](), [`lexer.COMMENT`](), --- [`lexer.STRING`](), [`lexer.NUMBER`](), [`lexer.KEYWORD`](), --- [`lexer.IDENTIFIER`](), [`lexer.OPERATOR`](), [`lexer.ERROR`](), --- [`lexer.PREPROCESSOR`](), [`lexer.CONSTANT`](), [`lexer.VARIABLE`](), --- [`lexer.FUNCTION`](), [`lexer.CLASS`](), [`lexer.TYPE`](), [`lexer.LABEL`](), --- [`lexer.REGEX`](), and [`lexer.EMBEDDED`](). 
Patterns include --- [`lexer.any`](), [`lexer.ascii`](), [`lexer.extend`](), [`lexer.alpha`](), --- [`lexer.digit`](), [`lexer.alnum`](), [`lexer.lower`](), [`lexer.upper`](), --- [`lexer.xdigit`](), [`lexer.cntrl`](), [`lexer.graph`](), [`lexer.print`](), --- [`lexer.punct`](), [`lexer.space`](), [`lexer.newline`](), --- [`lexer.nonnewline`](), [`lexer.nonnewline_esc`](), [`lexer.dec_num`](), --- [`lexer.hex_num`](), [`lexer.oct_num`](), [`lexer.integer`](), --- [`lexer.float`](), and [`lexer.word`](). You may use your own token names if --- none of the above fit your language, but an advantage to using predefined --- token names is that your lexer's tokens will inherit the universal syntax --- highlighting color theme used by your text editor. --- --- #### Example Tokens --- --- So, how might you define other tokens like keywords, comments, and strings? --- Here are some examples. --- --- **Keywords** --- --- Instead of matching _n_ keywords with _n_ `P('keyword_`_`n`_`')` ordered --- choices, use another convenience function: [`lexer.word_match()`](). It is --- much easier and more efficient to write word matches like: --- --- local keyword = token(l.KEYWORD, l.word_match[[ --- keyword_1 keyword_2 ... keyword_n --- ]]) --- --- local case_insensitive_keyword = token(l.KEYWORD, l.word_match([[ --- KEYWORD_1 keyword_2 ... KEYword_n --- ]], true)) --- --- local hyphened_keyword = token(l.KEYWORD, l.word_match[[ --- keyword-1 keyword-2 ... keyword-n --- ]]) --- --- **Comments** --- --- Line-style comments with a prefix character(s) are easy to express with LPeg: --- --- local shell_comment = token(l.COMMENT, '#' * l.nonnewline^0) --- local c_line_comment = token(l.COMMENT, '//' * l.nonnewline_esc^0) --- --- The comments above start with a '#' or "//" and go to the end of the line. --- The second comment recognizes the next line also as a comment if the current --- line ends with a '\' escape character. 
--- --- C-style "block" comments with a start and end delimiter are also easy to --- express: --- --- local c_comment = token(l.COMMENT, '/*' * (l.any - '*/')^0 * P('*/')^-1) --- --- This comment starts with a "/\*" sequence and contains anything up to and --- including an ending "\*/" sequence. The ending "\*/" is optional so the lexer --- can recognize unfinished comments as comments and highlight them properly. --- --- **Strings** --- --- It is tempting to think that a string is not much different from the block --- comment shown above in that both have start and end delimiters: --- --- local dq_str = '"' * (l.any - '"')^0 * P('"')^-1 --- local sq_str = "'" * (l.any - "'")^0 * P("'")^-1 --- local simple_string = token(l.STRING, dq_str + sq_str) --- --- However, most programming languages allow escape sequences in strings such --- that a sequence like "\\"" in a double-quoted string indicates that the --- '"' is not the end of the string. The above token incorrectly matches --- such a string. Instead, use the [`lexer.delimited_range()`]() convenience --- function. --- --- local dq_str = l.delimited_range('"') --- local sq_str = l.delimited_range("'") --- local string = token(l.STRING, dq_str + sq_str) --- --- In this case, the lexer treats '\' as an escape character in a string --- sequence. --- --- **Numbers** --- --- Most programming languages have the same format for integer and float tokens, --- so it might be as simple as using a couple of predefined LPeg patterns: --- --- local number = token(l.NUMBER, l.float + l.integer) --- --- However, some languages allow postfix characters on integers. --- --- local integer = P('-')^-1 * (l.dec_num * S('lL')^-1) --- local number = token(l.NUMBER, l.float + l.hex_num + integer) --- --- Your language may need other tweaks, but it is up to you how fine-grained you --- want your highlighting to be. After all, you are not writing a compiler or --- interpreter! 
--- --- ### Rules --- --- Programming languages have grammars, which specify valid token structure. For --- example, comments usually cannot appear within a string. Grammars consist of --- rules, which are simply combinations of tokens. Recall from the lexer --- template the [`lexer:add_rule()`]() call, which adds a rule to the lexer's --- grammar: --- --- lexer:add_rule('whitespace', ws) --- --- Each rule has an associated name, but rule names are completely arbitrary and --- serve only to identify and distinguish between different rules. Rule order is --- important: if text does not match the first rule added to the grammar, the --- lexer tries to match the second rule added, and so on. Right now this lexer --- simply matches whitespace tokens under a rule named "whitespace". --- --- To illustrate the importance of rule order, here is an example of a --- simplified Lua lexer: --- --- lexer:add_rule('whitespace', token(l.WHITESPACE, ...)) --- lexer:add_rule('keyword', token(l.KEYWORD, ...)) --- lexer:add_rule('identifier', token(l.IDENTIFIER, ...)) --- lexer:add_rule('string', token(l.STRING, ...)) --- lexer:add_rule('comment', token(l.COMMENT, ...)) --- lexer:add_rule('number', token(l.NUMBER, ...)) --- lexer:add_rule('label', token(l.LABEL, ...)) --- lexer:add_rule('operator', token(l.OPERATOR, ...)) --- --- Note how identifiers come after keywords. In Lua, as with most programming --- languages, the characters allowed in keywords and identifiers are in the same --- set (alphanumerics plus underscores). If the lexer added the "identifier" --- rule before the "keyword" rule, all keywords would match identifiers and thus --- incorrectly highlight as identifiers instead of keywords. The same idea --- applies to function, constant, etc. tokens that you may want to distinguish --- between: their rules should come before identifiers. --- --- So what about text that does not match any rules? For example in Lua, the '!' 
--- character is meaningless outside a string or comment. Normally the lexer --- skips over such text. If instead you want to highlight these "syntax errors", --- add an additional end rule: --- --- lexer:add_rule('whitespace', ws) --- ... --- lexer:add_rule('error', token(l.ERROR, l.any)) --- --- This identifies and highlights any character not matched by an existing --- rule as a `lexer.ERROR` token. --- --- Even though the rules defined in the examples above contain a single token, --- rules may consist of multiple tokens. For example, a rule for an HTML tag --- could consist of a tag token followed by an arbitrary number of attribute --- tokens, allowing the lexer to highlight all tokens separately. That rule --- might look something like this: --- --- lexer:add_rule('tag', tag_start * (ws * attributes)^0 * tag_end^-1) --- --- Note however that lexers with complex rules like these are more prone to lose --- track of their state, especially if they span multiple lines. --- --- ### Summary --- --- Lexers primarily consist of tokens and grammar rules. At your disposal are a --- number of convenience patterns and functions for rapidly creating a lexer. If --- you choose to use predefined token names for your tokens, you do not have to --- define how the lexer highlights them. The tokens will inherit the default --- syntax highlighting color theme your editor uses. --- --- ## Advanced Techniques --- --- ### Styles and Styling --- --- The most basic form of syntax highlighting is assigning different colors to --- different tokens. Instead of highlighting with just colors, Scintilla allows --- for more rich highlighting, or "styling", with different fonts, font sizes, --- font attributes, and foreground and background colors, just to name a few. --- The unit of this rich highlighting is called a "style". Styles are simply --- strings of comma-separated property settings. 
By default, lexers associate --- predefined token names like `lexer.WHITESPACE`, `lexer.COMMENT`, --- `lexer.STRING`, etc. with particular styles as part of a universal color --- theme. These predefined styles include [`lexer.STYLE_CLASS`](), --- [`lexer.STYLE_COMMENT`](), [`lexer.STYLE_CONSTANT`](), --- [`lexer.STYLE_ERROR`](), [`lexer.STYLE_EMBEDDED`](), --- [`lexer.STYLE_FUNCTION`](), [`lexer.STYLE_IDENTIFIER`](), --- [`lexer.STYLE_KEYWORD`](), [`lexer.STYLE_LABEL`](), [`lexer.STYLE_NUMBER`](), --- [`lexer.STYLE_OPERATOR`](), [`lexer.STYLE_PREPROCESSOR`](), --- [`lexer.STYLE_REGEX`](), [`lexer.STYLE_STRING`](), [`lexer.STYLE_TYPE`](), --- [`lexer.STYLE_VARIABLE`](), and [`lexer.STYLE_WHITESPACE`](). Like with --- predefined token names and LPeg patterns, you may define your own styles. At --- their core, styles are just strings, so you may create new ones and/or modify --- existing ones. Each style consists of the following comma-separated settings: --- --- Setting | Description --- ---------------|------------ --- font:_name_ | The name of the font the style uses. --- size:_int_ | The size of the font the style uses. --- [not]bold | Whether or not the font face is bold. --- weight:_int_ | The weight or boldness of a font, between 1 and 999. --- [not]italics | Whether or not the font face is italic. --- [not]underlined| Whether or not the font face is underlined. --- fore:_color_ | The foreground color of the font face. --- back:_color_ | The background color of the font face. --- [not]eolfilled | Does the background color extend to the end of the line? --- case:_char_ | The case of the font ('u': upper, 'l': lower, 'm': normal). --- [not]visible | Whether or not the text is visible. --- [not]changeable| Whether the text is changeable or read-only. --- --- Specify font colors in either "#RRGGBB" format, "0xBBGGRR" format, or the --- decimal equivalent of the latter. 
As with token names, LPeg patterns, and --- styles, there is a set of predefined color names, but they vary depending on --- the current color theme in use. Therefore, it is generally not a good idea to --- manually define colors within styles in your lexer since they might not fit --- into a user's chosen color theme. Try to refrain from even using predefined --- colors in a style because that color may be theme-specific. Instead, the best --- practice is to either use predefined styles or derive new color-agnostic --- styles from predefined ones. For example, Lua "longstring" tokens use the --- existing `lexer.STYLE_STRING` style instead of defining a new one. --- --- #### Example Styles --- --- Defining styles is pretty straightforward. An empty style that inherits the --- default theme settings is simply an empty string: --- --- local style_nothing = '' --- --- A similar style but with a bold font face looks like this: --- --- local style_bold = 'bold' --- --- If you want the same style, but also with an italic font face, define the new --- style in terms of the old one: --- --- local style_bold_italic = style_bold..',italics' --- --- This allows you to derive new styles from predefined ones without having to --- rewrite them. This operation leaves the old style unchanged. Thus if you --- had a "static variable" token whose style you wanted to base off of --- `lexer.STYLE_VARIABLE`, it would probably look like: --- --- local style_static_var = l.STYLE_VARIABLE..',italics' --- --- The color theme files in the *lexers/themes/* folder give more examples of --- style definitions. --- --- ### Token Styles --- --- Lexers use the [`lexer:add_style()`]() function to assign styles to --- particular tokens. Recall the token definition and rule from the lexer template: --- --- local ws = token(l.WHITESPACE, l.space^1) --- lexer:add_rule('whitespace', ws) --- --- Why is a style not assigned to the `lexer.WHITESPACE` token?
As mentioned --- earlier, lexers automatically associate tokens that use predefined token --- names with a particular style. Only tokens with custom token names need --- manual style associations. As an example, consider a custom whitespace token: --- --- local ws = token('custom_whitespace', l.space^1) --- --- Assigning a style to this token looks like: --- --- lexer:add_style('custom_whitespace', l.STYLE_WHITESPACE) --- --- Do not confuse token names with rule names. They are completely different --- entities. In the example above, the lexer associates the "custom_whitespace" --- token with the existing style for `lexer.WHITESPACE` tokens. If instead you --- prefer to color the background of whitespace a shade of grey, it might look --- like: --- --- local custom_style = l.STYLE_WHITESPACE..',back:$(color.grey)' --- lexer:add_style('custom_whitespace', custom_style) --- --- Notice that the lexer performs Scintilla/SciTE-style "$()" property expansion. --- You may also use "%()". Remember to refrain from assigning specific colors in --- styles, but in this case, all user color themes probably define the --- "color.grey" property. --- --- ### Line Lexers --- --- By default, lexers match the arbitrary chunks of text passed to them by --- Scintilla. These chunks may be a full document, only the visible part of a --- document, or even just portions of lines. Some lexers need to match whole --- lines. For example, a lexer for the output of a file "diff" needs to know if --- the line started with a '+' or '-' and then style the entire line --- accordingly. To indicate that your lexer matches by line, create the lexer --- with an extra parameter: --- --- local lexer = l.new('?', {lex_by_line = true}) --- --- Now the input text for the lexer is a single line at a time. Keep in mind --- that line lexers do not have the ability to look ahead at subsequent lines. --- --- ### Embedded Lexers --- --- Lexers embed within one another very easily, requiring minimal effort.
In the --- following sections, the lexer being embedded is called the "child" lexer and --- the lexer a child is being embedded in is called the "parent". For example, --- consider an HTML lexer and a CSS lexer. Either lexer stands alone for styling --- their respective HTML and CSS files. However, CSS can be embedded inside --- HTML. In this specific case, the CSS lexer is the "child" lexer with the HTML --- lexer being the "parent". Now consider an HTML lexer and a PHP lexer. This --- sounds a lot like the case with CSS, but there is a subtle difference: PHP --- _embeds itself into_ HTML while CSS is _embedded in_ HTML. This fundamental --- difference results in two types of embedded lexers: a parent lexer that --- embeds other child lexers in it (like HTML embedding CSS), and a child lexer --- that embeds itself into a parent lexer (like PHP embedding itself in HTML). --- --- #### Parent Lexer --- --- Before embedding a child lexer into a parent lexer, the parent lexer needs to --- load the child lexer. This is done with the [`lexer.load()`]() function. For --- example, loading the CSS lexer within the HTML lexer looks like: --- --- local css = l.load('css') --- --- The next part of the embedding process is telling the parent lexer when to --- switch over to the child lexer and when to switch back. The lexer refers to --- these indications as the "start rule" and "end rule", respectively, and are --- just LPeg patterns. Continuing with the HTML/CSS example, the transition from --- HTML to CSS is when the lexer encounters a "style" tag with a "type" --- attribute whose value is "text/css": --- --- local css_tag = P('<style') * P(function(input, index) --- if input:find('^[^>]+type="text/css"', index) then --- return index --- end --- end) --- --- This pattern looks for the beginning of a "style" tag and searches its --- attribute list for the text "`type="text/css"`". 
(In this simplified example, --- the Lua pattern does not consider whitespace between the '=' nor does it --- consider that using single quotes is valid.) If there is a match, the --- functional pattern returns a value instead of `nil`. In this case, the value --- returned does not matter because we ultimately want to style the "style" tag --- as an HTML tag, so the actual start rule looks like this: --- --- local css_start_rule = #css_tag * tag --- --- Now that the parent knows when to switch to the child, it needs to know when --- to switch back. In the case of HTML/CSS, the switch back occurs when the --- lexer encounters an ending "style" tag, though the lexer should still style --- the tag as an HTML tag: --- --- local css_end_rule = #P('</style>') * tag --- --- Once the parent loads the child lexer and defines the child's start and end --- rules, it embeds the child with the [`lexer:embed()`]() function: --- --- lexer:embed(css, css_start_rule, css_end_rule) --- --- #### Child Lexer --- --- The process for instructing a child lexer to embed itself into a parent is --- very similar to embedding a child into a parent: first, load the parent lexer --- into the child lexer with the [`lexer.load()`]() function and then create --- start and end rules for the child lexer. However, in this case, call --- [`lexer:embed()`]() with switched arguments. For example, in the PHP lexer: --- --- local html = l.load('html') --- local php_start_rule = token('php_tag', '<?php ') --- local php_end_rule = token('php_tag', '?>') --- lexer:add_style('php_tag', l.STYLE_EMBEDDED) --- html:embed(lexer, php_start_rule, php_end_rule) --- --- ### Lexers with Complex State --- --- A vast majority of lexers are not stateful and can operate on any chunk of --- text in a document. However, there may be rare cases where a lexer does need --- to keep track of some sort of persistent state. 
Rather than using `lpeg.P` --- function patterns that set state variables, it is recommended to make use of --- Scintilla's built-in, per-line state integers via [`lexer.line_state`](). It --- was designed to accommodate up to 32 bit flags for tracking state. --- [`lexer.line_from_position()`]() will return the line for any position given --- to an `lpeg.P` function pattern. (Any positions derived from that position --- argument will also work.) --- --- Writing stateful lexers is beyond the scope of this document. --- --- ## Code Folding --- --- When reading source code, it is occasionally helpful to temporarily hide --- blocks of code like functions, classes, comments, etc. This is the concept of --- "folding". In the Textadept and SciTE editors for example, little indicators --- in the editor margins appear next to code that can be folded at places called --- "fold points". When the user clicks an indicator, the editor hides the code --- associated with the indicator until the user clicks the indicator again. The --- lexer specifies these fold points and what code exactly to fold. --- --- The fold points for most languages occur on keywords or character sequences. --- Examples of fold keywords are "if" and "end" in Lua and examples of fold --- character sequences are '{', '}', "/\*", and "\*/" in C for code block and --- comment delimiters, respectively. However, these fold points cannot occur --- just anywhere. For example, lexers should not recognize fold keywords that --- appear within strings or comments. The [`lexer:add_fold_point()`]() function --- allows you to conveniently define fold points with such granularity. For --- example, consider C: --- --- lexer:add_fold_point(l.OPERATOR, '{', '}') --- lexer:add_fold_point(l.COMMENT, '/*', '*/') --- --- The first assignment states that any '{' or '}' that the lexer recognized as --- an `lexer.OPERATOR` token is a fold point. 
Likewise, the second assignment --- states that any "/\*" or "\*/" that the lexer recognizes as part of a --- `lexer.COMMENT` token is a fold point. The lexer does not consider any --- occurrences of these characters outside their defined tokens (such as in a --- string) as fold points. How do you specify fold keywords? Here is an example --- for Lua: --- --- lexer:add_fold_point(l.KEYWORD, 'if', 'end') --- lexer:add_fold_point(l.KEYWORD, 'do', 'end') --- lexer:add_fold_point(l.KEYWORD, 'function', 'end') --- lexer:add_fold_point(l.KEYWORD, 'repeat', 'until') --- --- If your lexer has case-insensitive keywords as fold points, simply add a --- `case_insensitive_fold_points = true` option to [`lexer.new()`](), and --- specify keywords in lower case. --- --- If your lexer needs to do some additional processing in order to determine if --- a token is a fold point, pass a function that returns an integer to --- `lexer:add_fold_point()`. Returning `1` indicates the token is a beginning --- fold point and returning `-1` indicates the token is an ending fold point. --- Returning `0` indicates the token is not a fold point. For example: --- --- local function fold_strange_token(text, pos, line, s, symbol) --- if ... then --- return 1 -- beginning fold point --- elseif ... then --- return -1 -- ending fold point --- end --- return 0 --- end --- --- lexer:add_fold_point('strange_token', '|', fold_strange_token) --- --- Any time the lexer encounters a '|' that is a "strange_token", it calls the --- `fold_strange_token` function to determine if '|' is a fold point. The lexer --- calls these functions with the following arguments: the text to identify fold --- points in, the beginning position of the current line in the text to fold, --- the current line's text, the position in the current line the fold point text --- starts at, and the fold point text itself. 
--- --- ### Fold by Indentation --- --- Some languages have significant whitespace and/or no delimiters that indicate --- fold points. If your lexer falls into this category and you would like to --- mark fold points based on changes in indentation, create the lexer with a --- `fold_by_indentation = true` option: --- --- local lexer = l.new('?', {fold_by_indentation = true}) --- --- ## Using Lexers --- --- ### Textadept --- --- Put your lexer in your *~/.textadept/lexers/* directory so you do not --- overwrite it when upgrading Textadept. Also, lexers in this directory --- override default lexers. Thus, Textadept loads a user *lua* lexer instead of --- the default *lua* lexer. This is convenient for tweaking a default lexer to --- your liking. Then add a [file type][] for your lexer if necessary. --- --- [file type]: textadept.file_types.html --- --- ### SciTE --- --- Create a *.properties* file for your lexer and `import` it in either your --- *SciTEUser.properties* or *SciTEGlobal.properties*. The contents of the --- *.properties* file should contain: --- --- file.patterns.[lexer_name]=[file_patterns] --- lexer.$(file.patterns.[lexer_name])=[lexer_name] --- --- where `[lexer_name]` is the name of your lexer (minus the *.lua* extension) --- and `[file_patterns]` is a set of file extensions to use your lexer for. --- --- Please note that Lua lexers ignore any styling information in *.properties* --- files. Your theme file in the *lexers/themes/* directory contains styling --- information. --- --- ## Considerations --- --- ### Performance --- --- There might be some slight overhead when initializing a lexer, but loading a --- file from disk into Scintilla is usually more expensive. On modern computer --- systems, I see no difference in speed between LPeg lexers and Scintilla's C++ --- ones. Optimize lexers for speed by re-arranging `lexer:add_rule()` calls so --- that the most common rules match first. Do keep in mind that order matters --- for similar rules. 
--- --- In some cases, folding may be far more expensive than lexing, particularly --- in lexers with a lot of potential fold points. If your lexer is exhibiting --- signs of slowness, try disabling folding in your text editor first. If that --- speeds things up, you can try reducing the number of fold points you added, --- overriding `lexer:fold()` with your own implementation, or simply eliminating --- folding support from your lexer. --- --- ### Limitations --- --- Embedded preprocessor languages like PHP cannot completely embed in their --- parent languages in that the parent's tokens do not support start and end --- rules. This mostly goes unnoticed, but code like --- --- <div id="<?php echo $id; ?>"> --- --- will not style correctly. --- --- ### Troubleshooting --- --- Errors in lexers can be tricky to debug. Lexers print Lua errors to --- `io.stderr` and `_G.print()` statements to `io.stdout`. Running your editor --- from a terminal is the easiest way to see errors as they occur. --- --- ### Risks --- --- Poorly written lexers have the ability to crash Scintilla (and thus its --- containing application), so unsaved data might be lost. However, I have only --- observed these crashes in early lexer development, when syntax errors or --- pattern errors are present. Once the lexer actually starts styling text --- (either correctly or incorrectly, it does not matter), I have not observed --- any crashes. --- --- ### Acknowledgements --- --- Thanks to Peter Odding for his [lexer post][] on the Lua mailing list --- that inspired me, and thanks to Roberto Ierusalimschy for LPeg. --- --- [lexer post]: http://lua-users.org/lists/lua-l/2007-04/msg00116.html --- @field path (string) --- The path used to search for a lexer to load. --- Identical in format to Lua's `package.path` string. --- The default value is `package.path`. --- @field DEFAULT (string) --- The token name for default tokens. --- @field WHITESPACE (string) --- The token name for whitespace tokens. 
--- @field COMMENT (string) --- The token name for comment tokens. --- @field STRING (string) --- The token name for string tokens. --- @field NUMBER (string) --- The token name for number tokens. --- @field KEYWORD (string) --- The token name for keyword tokens. --- @field IDENTIFIER (string) --- The token name for identifier tokens. --- @field OPERATOR (string) --- The token name for operator tokens. --- @field ERROR (string) --- The token name for error tokens. --- @field PREPROCESSOR (string) --- The token name for preprocessor tokens. --- @field CONSTANT (string) --- The token name for constant tokens. --- @field VARIABLE (string) --- The token name for variable tokens. --- @field FUNCTION (string) --- The token name for function tokens. --- @field CLASS (string) --- The token name for class tokens. --- @field TYPE (string) --- The token name for type tokens. --- @field LABEL (string) --- The token name for label tokens. --- @field REGEX (string) --- The token name for regex tokens. --- @field STYLE_CLASS (string) --- The style typically used for class definitions. --- @field STYLE_COMMENT (string) --- The style typically used for code comments. --- @field STYLE_CONSTANT (string) --- The style typically used for constants. --- @field STYLE_ERROR (string) --- The style typically used for erroneous syntax. --- @field STYLE_FUNCTION (string) --- The style typically used for function definitions. --- @field STYLE_KEYWORD (string) --- The style typically used for language keywords. --- @field STYLE_LABEL (string) --- The style typically used for labels. --- @field STYLE_NUMBER (string) --- The style typically used for numbers. --- @field STYLE_OPERATOR (string) --- The style typically used for operators. --- @field STYLE_REGEX (string) --- The style typically used for regular expression strings. --- @field STYLE_STRING (string) --- The style typically used for strings. --- @field STYLE_PREPROCESSOR (string) --- The style typically used for preprocessor statements. 
--- @field STYLE_TYPE (string) --- The style typically used for static types. --- @field STYLE_VARIABLE (string) --- The style typically used for variables. --- @field STYLE_WHITESPACE (string) --- The style typically used for whitespace. --- @field STYLE_EMBEDDED (string) --- The style typically used for embedded code. --- @field STYLE_IDENTIFIER (string) --- The style typically used for identifier words. --- @field STYLE_DEFAULT (string) --- The style all styles are based off of. --- @field STYLE_LINENUMBER (string) --- The style used for all margins except fold margins. --- @field STYLE_BRACELIGHT (string) --- The style used for highlighted brace characters. --- @field STYLE_BRACEBAD (string) --- The style used for unmatched brace characters. --- @field STYLE_CONTROLCHAR (string) --- The style used for control characters. --- Color attributes are ignored. --- @field STYLE_INDENTGUIDE (string) --- The style used for indentation guides. --- @field STYLE_CALLTIP (string) --- The style used by call tips if [`buffer.call_tip_use_style`]() is set. --- Only the font name, size, and color attributes are used. --- @field STYLE_FOLDDISPLAYTEXT (string) --- The style used for fold display text. --- @field any (pattern) --- A pattern that matches any single character. --- @field ascii (pattern) --- A pattern that matches any ASCII character (codes 0 to 127). --- @field extend (pattern) --- A pattern that matches any ASCII extended character (codes 0 to 255). --- @field alpha (pattern) --- A pattern that matches any alphabetic character ('A'-'Z', 'a'-'z'). --- @field digit (pattern) --- A pattern that matches any digit ('0'-'9'). --- @field alnum (pattern) --- A pattern that matches any alphanumeric character ('A'-'Z', 'a'-'z', --- '0'-'9'). --- @field lower (pattern) --- A pattern that matches any lower case character ('a'-'z'). --- @field upper (pattern) --- A pattern that matches any upper case character ('A'-'Z'). 
--- @field xdigit (pattern) --- A pattern that matches any hexadecimal digit ('0'-'9', 'A'-'F', 'a'-'f'). --- @field cntrl (pattern) --- A pattern that matches any control character (ASCII codes 0 to 31). --- @field graph (pattern) --- A pattern that matches any graphical character ('!' to '~'). --- @field print (pattern) --- A pattern that matches any printable character (' ' to '~'). --- @field punct (pattern) --- A pattern that matches any punctuation character ('!' to '/', ':' to '@', --- '[' to '`', '{' to '~'). --- @field space (pattern) --- A pattern that matches any whitespace character ('\t', '\v', '\f', '\n', --- '\r', space). --- @field newline (pattern) --- A pattern that matches any set of end of line characters. --- @field nonnewline (pattern) --- A pattern that matches any single, non-newline character. --- @field nonnewline_esc (pattern) --- A pattern that matches any single, non-newline character or any set of end --- of line characters escaped with '\'. --- @field dec_num (pattern) --- A pattern that matches a decimal number. --- @field hex_num (pattern) --- A pattern that matches a hexadecimal number. --- @field oct_num (pattern) --- A pattern that matches an octal number. --- @field integer (pattern) --- A pattern that matches either a decimal, hexadecimal, or octal number. --- @field float (pattern) --- A pattern that matches a floating point number. --- @field word (pattern) --- A pattern that matches a typical word. Words begin with a letter or --- underscore and consist of alphanumeric and underscore characters. --- @field FOLD_BASE (number) --- The initial (root) fold level. --- @field FOLD_BLANK (number) --- Flag indicating that the line is blank. --- @field FOLD_HEADER (number) --- Flag indicating the line is a fold point. --- @field fold_level (table, Read-only) --- Table of fold level bit-masks for line numbers starting from zero. 
--   Fold level masks are composed of an integer level combined with any of the
--   following bits:
--
--   * `lexer.FOLD_BASE`
--     The initial fold level.
--   * `lexer.FOLD_BLANK`
--     The line is blank.
--   * `lexer.FOLD_HEADER`
--     The line is a header, or fold point.
-- @field indent_amount (table, Read-only)
--   Table of indentation amounts in character columns, for line numbers
--   starting from zero.
-- @field line_state (table)
--   Table of integer line states for line numbers starting from zero.
--   Line states can be used by lexers for keeping track of persistent states.
-- @field property (table)
--   Map of key-value string pairs.
-- @field property_expanded (table, Read-only)
--   Map of key-value string pairs with `$()` and `%()` variable replacement
--   performed in values.
-- @field property_int (table, Read-only)
--   Map of key-value pairs with values interpreted as numbers, or `0` if not
--   found.
-- @field style_at (table, Read-only)
--   Table of style names at positions in the buffer starting from 1.
module('lexer')]=]

local lpeg = require('lpeg')
local lpeg_P, lpeg_R, lpeg_S, lpeg_V = lpeg.P, lpeg.R, lpeg.S, lpeg.V
local lpeg_Ct, lpeg_Cc, lpeg_Cp = lpeg.Ct, lpeg.Cc, lpeg.Cp
local lpeg_Cmt, lpeg_C = lpeg.Cmt, lpeg.C
local lpeg_match = lpeg.match

M.path = package.path

if not package.searchpath then
  -- Searches for the given *name* in the given *path*.
  -- This is an implementation of Lua 5.2's `package.searchpath()` function for
  -- Lua 5.1. Each ';'-separated template in *path* has its '?' characters
  -- replaced by *name*, and the first resulting file that can be opened for
  -- reading wins.
  -- Note that, unlike the Lua 5.2 function, '.' characters in *name* are not
  -- translated into directory separators.
  -- @param name The name to substitute for '?' in each path template.
  -- @param path String of ';'-separated path templates.
  -- @return the first readable filename, or `nil` plus a newline-separated
  --   list of "no file '...'" messages, one per template tried.
  function package.searchpath(name, path)
    -- `gsub()` interprets '%' in a replacement string as an escape character,
    -- so double any '%' in order for *name* to be substituted literally.
    local replacement = name:gsub('%%', '%%%%')
    local tried = {}
    for template in path:gmatch('[^;]+') do
      local filename = template:gsub('%?', replacement)
      local f = io.open(filename, 'r')
      if f then
        f:close() -- only probing for existence/readability
        return filename
      end
      tried[#tried + 1] = string.format("no file '%s'", filename)
    end
    return nil, table.concat(tried, '\n')
  end
end

local string_upper = string.upper
-- Default styles.
local default = {
  'nothing', 'whitespace', 'comment', 'string', 'number', 'keyword',
  'identifier', 'operator', 'error', 'preprocessor', 'constant', 'variable',
  'function', 'class', 'type', 'label', 'regex', 'embedded'
}
-- For each name, define `M.NAME` (the token name) and `M.STYLE_NAME` (a
-- "$(style.name)" property expansion string).
for i = 1, #default do
  local name, upper_name = default[i], string_upper(default[i])
  M[upper_name], M['STYLE_'..upper_name] = name, '$(style.'..name..')'
end
-- Predefined styles.
local predefined = {
  'default', 'linenumber', 'bracelight', 'bracebad', 'controlchar',
  'indentguide', 'calltip', 'folddisplaytext'
}
for i = 1, #predefined do
  local name, upper_name = predefined[i], string_upper(predefined[i])
  M[upper_name], M['STYLE_'..upper_name] = name, '$(style.'..name..')'
end

---
-- Adds pattern *rule* identified by string *id* to the ordered list of rules
-- for lexer *lexer*.
-- If *id* has already been added, its pattern is replaced and it keeps its
-- original position in the rule order.
-- @param lexer The lexer to add the given rule to.
-- @param id The id associated with this rule. It does not have to be the same
--   as the name passed to `token()`.
-- @param rule The LPeg pattern of the rule.
-- @see modify_rule
-- @name add_rule
function M.add_rule(lexer, id, rule)
  if lexer._lexer then lexer = lexer._lexer end -- proxy; get true parent
  if not lexer._RULES then
    lexer._RULES = {}
    -- Contains an ordered list (by numerical index) of rule names. This is used
    -- in conjunction with lexer._RULES for building _TOKENRULE.
    lexer._RULEORDER = {}
  end
  -- Record the position of an id only once. Appending an id that is already
  -- listed would join the same pattern into the token rule a second time.
  local order, listed = lexer._RULEORDER, false
  for i = 1, #order do
    if order[i] == id then
      listed = true
      break
    end
  end
  if not listed then order[#order + 1] = id end
  lexer._RULES[id] = rule
  lexer:build_grammar()
end

---
-- Replaces in lexer *lexer* the existing rule identified by string *id* with
-- pattern *rule*.
-- @param lexer The lexer to modify.
-- @param id The id associated with this rule.
-- @param rule The LPeg pattern of the rule.
-- @name modify_rule
function M.modify_rule(lexer, id, rule)
  -- A proxy lexer stores its rules in its true parent, so operate on that.
  local target = lexer._lexer or lexer
  target._RULES[id] = rule
  target:build_grammar()
end

---
-- Associates string *token_name* in lexer *lexer* with Scintilla style string
-- *style*.
-- Style strings are comma-separated property settings. Available property
-- settings are:
--
--   * `font:name`: Font name.
--   * `size:int`: Font size.
--   * `bold` or `notbold`: Whether or not the font face is bold.
--   * `weight:int`: Font weight (between 1 and 999).
--   * `italics` or `notitalics`: Whether or not the font face is italic.
--   * `underlined` or `notunderlined`: Whether or not the font face is
--     underlined.
--   * `fore:color`: Font face foreground color in "#RRGGBB" or 0xBBGGRR format.
--   * `back:color`: Font face background color in "#RRGGBB" or 0xBBGGRR format.
--   * `eolfilled` or `noteolfilled`: Whether or not the background color
--     extends to the end of the line.
--   * `case:char`: Font case ('u' for uppercase, 'l' for lowercase, and 'm' for
--     mixed case).
--   * `visible` or `notvisible`: Whether or not the text is visible.
--   * `changeable` or `notchangeable`: Whether or not the text is changeable or
--     read-only.
--
-- Property settings may also contain "$(property.name)" expansions for
-- properties defined in Scintilla, theme files, etc.
-- @param lexer The lexer to add a style to.
-- @param token_name The name of the token to associate with the style.
-- @param style A style string for Scintilla.
-- @usage lexer:add_style('longstring', l.STYLE_STRING)
-- @usage lexer:add_style('deprecated_function', l.STYLE_FUNCTION..',italics')
-- @usage lexer:add_style('visible_ws',
--   l.STYLE_WHITESPACE..',back:$(color.grey)')
-- @name add_style
function M.add_style(lexer, token_name, style)
  local style_num = lexer._numstyles
  -- Style numbers 32 through 39 belong to the predefined styles; jump over
  -- them.
  if style_num == 32 then style_num = style_num + 8 end
  -- Note: this only reports the overflow; the style is still registered below.
  if style_num >= 255 then print('Too many styles defined (255 MAX)') end
  lexer._TOKENSTYLES[token_name] = style_num
  lexer._numstyles = style_num + 1
  lexer._EXTRASTYLES[token_name] = style
  -- If the lexer is a proxy or a child that embedded itself, copy this style to
  -- the parent lexer.
  local parent = lexer._lexer
  if parent then parent:add_style(token_name, style) end
end

---
-- Adds to lexer *lexer* a fold point whose beginning and end tokens are string
-- *token_name* tokens with string content *start_symbol* and *end_symbol*,
-- respectively.
-- In the event that *start_symbol* may or may not be a fold point depending on
-- context, and that additional processing is required, *end_symbol* may be a
-- function that ultimately returns `1` (indicating a beginning fold point),
-- `-1` (indicating an ending fold point), or `0` (indicating no fold point).
-- That function is passed the following arguments:
--
--   * `text`: The text being processed for fold points.
--   * `pos`: The position in *text* of the beginning of the line currently
--     being processed.
--   * `line`: The text of the line currently being processed.
--   * `s`: The position of *start_symbol* in *line*.
--   * `symbol`: *start_symbol* itself.
-- @param lexer The lexer to add a fold point to.
-- @param token_name The token name of text that indicates a fold point.
-- @param start_symbol The text that indicates the beginning of a fold point.
-- @param end_symbol Either the text that indicates the end of a fold point, or
--   a function that returns whether or not *start_symbol* is a beginning fold
--   point (1), an ending fold point (-1), or not a fold point at all (0).
-- @usage lexer:add_fold_point(l.OPERATOR, '{', '}')
-- @usage lexer:add_fold_point(l.KEYWORD, 'if', 'end')
-- @usage lexer:add_fold_point(l.COMMENT, '#', l.fold_line_comments('#'))
-- @usage lexer:add_fold_point('custom', '|',
--   function(text, pos, line, s, symbol) ... end)
-- @name add_fold_point
function M.add_fold_point(lexer, token_name, start_symbol, end_symbol)
  local fold_points = lexer._FOLDPOINTS
  if not fold_points then
    fold_points = {_SYMBOLS = {}}
    lexer._FOLDPOINTS = fold_points
  end

  -- `_SYMBOLS` is both an ordered list of every fold symbol and a set (keyed
  -- by symbol) used to keep that list free of duplicates.
  local known = fold_points._SYMBOLS
  local function remember(symbol)
    if not known[symbol] then
      known[#known + 1], known[symbol] = symbol, true
    end
  end

  remember(start_symbol)
  local token_points = fold_points[token_name]
  if not token_points then
    token_points = {}
    fold_points[token_name] = token_points
  end

  if type(end_symbol) == 'string' then
    remember(end_symbol)
    token_points[start_symbol] = 1
    token_points[end_symbol] = -1
  else
    token_points[start_symbol] = end_symbol -- function or int
  end

  -- If the lexer is a proxy or a child that embedded itself, copy this fold
  -- point to the parent lexer.
  local parent = lexer._lexer
  if parent then
    parent:add_fold_point(token_name, start_symbol, end_symbol)
  end
end

-- (Re)constructs `lexer._TOKENRULE`.
-- The token rule is the ordered choice of all of the lexer's rules (in the
-- order given by `lexer._RULEORDER`), followed by a fallback that matches any
-- single character as a default token.
-- @param lexer The lexer whose rules to join.
-- @return the newly constructed token rule
local function join_tokens(lexer)
  local rules, ids = lexer._RULES, lexer._RULEORDER
  local joined = rules[ids[1]]
  for i = 2, #ids do
    joined = joined + rules[ids[i]]
  end
  joined = joined + M.token(M.DEFAULT, M.any)
  lexer._TOKENRULE = joined
  return joined
end

-- Metatable for Scintillua grammars.
-- These grammars are just tables ultimately passed to `lpeg.P()`.
local grammar_methods = {}

-- Adds lexer *lexer* and any of its embedded lexers to this grammar.
-- For each child, two grammar rules are created: one named after the child
-- (the child's tokens up to its end rule, followed by a return to *lexer*'s
-- rule) and one named '_' plus the child's name (the child's start rule
-- followed by the child's tokens up to its end rule). The latter is tried
-- before *lexer*'s own tokens.
-- @param self The grammar table.
-- @param lexer The lexer to add.
function grammar_methods.add_lexer(self, lexer)
  local parent_name = lexer._NAME
  local token_rule = lexer:join_tokens()
  for _, child in ipairs(lexer._CHILDREN) do
    if child._CHILDREN then self:add_lexer(child) end
    local rules = child._EMBEDDEDRULES[parent_name]
    local end_rule = rules.end_rule
    -- Prefer the child's grammar-level token rule, if it has one (set when the
    -- child has children of its own and was added above).
    local child_tokens = self['__'..child._NAME] or rules.token_rule
    -- Child tokens up to the end rule, then the end rule itself (if present).
    local child_body = (-end_rule * child_tokens)^0 * end_rule^-1
    self[child._NAME] = child_body * lpeg_V(parent_name)
    local embedded_child = '_'..child._NAME
    self[embedded_child] = rules.start_rule * child_body
    token_rule = lpeg_V(embedded_child) + token_rule
  end
  self['__'..parent_name] = token_rule -- can contain embedded lexer rules
  self[parent_name] = token_rule^0
end

local grammar_mt = {__index = grammar_methods}

-- (Re)constructs `lexer._GRAMMAR`.
-- Does nothing if the lexer has no rules yet.
-- @param lexer The lexer to build the grammar for.
-- @param initial_rule The name of the rule to start lexing with. The default
--   value is `lexer._NAME`. Multilang lexers use this to start with a child
--   rule if necessary.
local function build_grammar(lexer, initial_rule)
  if not lexer._RULES then return end
  if not lexer._CHILDREN then
    -- No embedded lexers: the grammar is just a repetition of the token rule.
    lexer._GRAMMAR = lpeg_Ct(lexer:join_tokens()^0)
    return
  end
  local start = initial_rule or lexer._NAME
  local grammar = setmetatable({start}, grammar_mt)
  grammar:add_lexer(lexer)
  lexer._INITIALRULE = start
  lexer._GRAMMAR = lpeg_Ct(lpeg_P(grammar))
end

---
-- Embeds child lexer *child* in parent lexer *lexer* using patterns
-- *start_rule* and *end_rule*, which signal the beginning and end of the
-- embedded lexer, respectively.
-- @param lexer The parent lexer.
-- @param child The child lexer.
-- @param start_rule The pattern that signals the beginning of the embedded
--   lexer.
-- @param end_rule The pattern that signals the end of the embedded lexer.
-- @usage html:embed(css, css_start_rule, css_end_rule)
-- @usage html:embed(lexer, php_start_rule, php_end_rule) -- from php lexer
-- @name embed
function M.embed(lexer, child, start_rule, end_rule)
  if lexer._lexer then lexer = lexer._lexer end -- proxy; get true parent
  -- Add child rules, keyed by the name of the parent being embedded into.
  if not child._EMBEDDEDRULES then child._EMBEDDEDRULES = {} end
  if not child._RULES then error('Cannot embed lexer with no rules') end
  child._EMBEDDEDRULES[lexer._NAME] = {
    ['start_rule'] = start_rule,
    token_rule = child:join_tokens(),
    ['end_rule'] = end_rule
  }
  if not lexer._CHILDREN then lexer._CHILDREN = {} end
  local children = lexer._CHILDREN
  children[#children + 1] = child
  -- Add child styles.
  for token, style in pairs(child._EXTRASTYLES) do
    lexer:add_style(token, style)
  end
  -- Add child fold symbols.
  if child._FOLDPOINTS then
    for token_name, symbols in pairs(child._FOLDPOINTS) do
      if token_name ~= '_SYMBOLS' then -- skip the symbol list/set itself
        for symbol, v in pairs(symbols) do
          lexer:add_fold_point(token_name, symbol, v)
        end
      end
    end
  end
  lexer:build_grammar()
  child._lexer = lexer -- use parent's tokens if child is embedding itself
end

---
-- Lexes a chunk of text *text* (that has an initial style number of
-- *init_style*) using lexer *lexer*, returning a table of token names and
-- positions.
-- The returned table is a flat list of alternating token names and positions,
-- where each position is one past the last character of its token.
-- @param lexer The lexer to lex text with.
-- @param text The text in the buffer to lex.
-- @param init_style The current style. Multiple-language lexers use this to
--   determine which language to start lexing in.
-- @return table of token names and positions.
-- @name lex
function M.lex(lexer, text, init_style)
  if not lexer._GRAMMAR then return {M.DEFAULT, #text + 1} end
  if not lexer._LEXBYLINE then
    -- For multilang lexers, build a new grammar whose initial_rule is the
    -- current language.
    if lexer._CHILDREN then
      for style, style_num in pairs(lexer._TOKENSTYLES) do
        if style_num == init_style then
          -- A "<name>_whitespace" style identifies the language being lexed.
          local lexer_name = style:match('^(.+)_whitespace') or lexer._NAME
          if lexer._INITIALRULE ~= lexer_name then
            lexer:build_grammar(lexer_name)
          end
          break
        end
      end
    end
    return lpeg_match(lexer._GRAMMAR, text)
  else
    local tokens = {}
    -- Appends the name/position pairs of a single line to the overall token
    -- list, shifting line-relative positions by *offset*.
    local function append(tokens, line_tokens, offset)
      for i = 1, #line_tokens, 2 do
        tokens[#tokens + 1] = line_tokens[i]
        tokens[#tokens + 1] = line_tokens[i + 1] + offset
      end
    end
    local offset = 0
    local grammar = lexer._GRAMMAR
    for line in text:gmatch('[^\r\n]*\r?\n?') do
      local line_tokens = lpeg_match(grammar, line)
      if line_tokens then append(tokens, line_tokens, offset) end
      offset = offset + #line
      -- Use the default style to the end of the line if none was specified.
      -- The end-of-line position is `offset + 1`, so compare against that
      -- rather than `offset`. Otherwise a redundant, empty default token would
      -- be appended after every line.
      if tokens[#tokens] ~= offset + 1 then
        tokens[#tokens + 1], tokens[#tokens + 2] = 'default', offset + 1
      end
    end
    return tokens
  end
end

---
-- Determines fold points in a chunk of text *text* using lexer *lexer*,
-- returning a table of fold levels associated with line numbers.
-- *text* starts at position *start_pos* on line number *start_line* with a
-- beginning fold level of *start_level* in the buffer.
-- @param lexer The lexer to fold text with.
-- @param text The text in the buffer to fold.
-- @param start_pos The position in the buffer *text* starts at, starting at
--   zero.
-- @param start_line The line number *text* starts on.
-- @param start_level The fold level *text* starts on.
-- @return table of fold levels associated with line numbers.
-- @name fold
function M.fold(lexer, text, start_pos, start_line, start_level)
  local folds = {}
  if text == '' then return folds end
  local fold = M.property_int['fold'] > 0
  local FOLD_BASE = M.FOLD_BASE
  local FOLD_HEADER, FOLD_BLANK = M.FOLD_HEADER, M.FOLD_BLANK
  if fold and lexer._FOLDPOINTS then
    -- Fold point based folding.
    -- Split the text into {position, line text} pairs.
    local lines = {}
    for p, l in (text..'\n'):gmatch('()(.-)\r?\n') do
      lines[#lines + 1] = {p, l}
    end
    local fold_zero_sum_lines = M.property_int['fold.on.zero.sum.lines'] > 0
    local fold_points = lexer._FOLDPOINTS
    local fold_point_symbols = fold_points._SYMBOLS
    local style_at, fold_level = M.style_at, M.fold_level
    local line_num, prev_level = start_line, start_level
    local current_level = prev_level
    for i = 1, #lines do
      local pos, line = lines[i][1], lines[i][2]
      if line ~= '' then
        if lexer._CASEINSENSITIVEFOLDPOINTS then line = line:lower() end
        local level_decreased = false
        for j = 1, #fold_point_symbols do
          local symbol = fold_point_symbols[j]
          -- Symbols made up entirely of word characters must match whole words.
          local word = not symbol:find('[^%w_]')
          local s, e = line:find(symbol, 1, true)
          while s and e do
            --if not word or line:find('^%f[%w_]'..symbol..'%f[^%w_]', s) then
            if not word or not ((s > 1 and line:find('^[%w_]', s - 1)) or
                                line:find('^[%w_]', e + 1)) then
              -- The symbol only counts if its style's token has fold points.
              local symbols = fold_points[style_at[start_pos + pos + s - 1]]
              local level = symbols and symbols[symbol]
              if type(level) == 'function' then
                level = level(text, pos, line, s, symbol)
              end
              if type(level) == 'number' then
                current_level = current_level + level
                if level < 0 and current_level < prev_level then
                  -- Potential zero-sum line. If the level were to go back up on
                  -- the same line, the line may be marked as a fold header.
                  level_decreased = true
                end
              end
            end
            -- Update both the start and end of the next occurrence. The
            -- trailing word-boundary check above depends on an accurate `e`.
            s, e = line:find(symbol, s + 1, true)
          end
        end
        folds[line_num] = prev_level
        if current_level > prev_level then
          folds[line_num] = prev_level + FOLD_HEADER
        elseif level_decreased and current_level == prev_level and
               fold_zero_sum_lines then
          if line_num > start_line then
            folds[line_num] = prev_level - 1 + FOLD_HEADER
          else
            -- Typing within a zero-sum line.
            local level = fold_level[line_num - 1] - 1
            if level > FOLD_HEADER then level = level - FOLD_HEADER end
            if level > FOLD_BLANK then level = level - FOLD_BLANK end
            folds[line_num] = level + FOLD_HEADER
            current_level = current_level + 1
          end
        end
        if current_level < FOLD_BASE then current_level = FOLD_BASE end
        prev_level = current_level
      else
        folds[line_num] = prev_level + FOLD_BLANK
      end
      line_num = line_num + 1
    end
  elseif fold and (lexer._FOLDBYINDENTATION or
                   M.property_int['fold.by.indentation'] > 0) then
    -- Indentation based folding.
    -- Calculate indentation per line (`false` for blank lines).
    local indentation = {}
    for indent, line in (text..'\n'):gmatch('([\t ]*)([^\r\n]*)\r?\n') do
      indentation[#indentation + 1] = line ~= '' and #indent
    end
    -- Find the first non-blank line before start_line. If the current line is
    -- indented, make that previous line a header and update the levels of any
    -- blank lines inbetween. If the current line is blank, match the level of
    -- the previous non-blank line.
    local current_level = start_level
    for i = start_line - 1, 0, -1 do
      local level = M.fold_level[i]
      if level >= FOLD_HEADER then level = level - FOLD_HEADER end
      if level < FOLD_BLANK then
        local indent = M.indent_amount[i]
        if indentation[1] and indentation[1] > indent then
          folds[i] = FOLD_BASE + indent + FOLD_HEADER
          for j = i + 1, start_line - 1 do
            folds[j] = start_level + FOLD_BLANK
          end
        elseif not indentation[1] then
          current_level = FOLD_BASE + indent
        end
        break
      end
    end
    -- Iterate over lines, setting fold numbers and fold flags.
    for i = 1, #indentation do
      if indentation[i] then
        current_level = FOLD_BASE + indentation[i]
        folds[start_line + i - 1] = current_level
        -- The line is a header if the next non-blank line is indented further.
        for j = i + 1, #indentation do
          if indentation[j] then
            if FOLD_BASE + indentation[j] > current_level then
              folds[start_line + i - 1] = current_level + FOLD_HEADER
              current_level = FOLD_BASE + indentation[j] -- for any blanks below
            end
            break
          end
        end
      else
        folds[start_line + i - 1] = current_level + FOLD_BLANK
      end
    end
  else
    -- No folding, reset fold levels if necessary.
    local current_line = start_line
    for _ in text:gmatch('\r?\n') do
      folds[current_line] = start_level
      current_line = current_line + 1
    end
  end
  return folds
end

---
-- Creates and returns a new lexer with the given name.
-- @param name The lexer's name.
-- @param opts Table of lexer options. Options currently supported:
--   * `lex_by_line`: Whether or not the lexer only processes whole lines of
--     text (instead of arbitrary chunks of text) at a time.
--     Line lexers cannot look ahead to subsequent lines.
--     The default value is `false`.
--   * `fold_by_indentation`: Whether or not the lexer does not define any fold
--     points and that fold points should be calculated based on changes in line
--     indentation.
--     The default value is `false`.
--   * `case_insensitive_fold_points`: Whether or not fold points added via
--     `lexer:add_fold_point()` ignore case.
--     The default value is `false`.
--   * `inherit`: Lexer to inherit from.
--     The default value is `nil`.
-- @usage l.new('rhtml', {inherit = l.load('html')})
-- @name new
function M.new(name, opts)
  assert(name, 'lexer name expected')

  -- Reads the option named *key*, tolerating a missing options table.
  local function option(key) return opts and opts[key] end

  -- Create the initial maps for token names to style numbers and styles.
  -- Default tokens are numbered from 0 and predefined styles from 32.
  local style_numbers = {}
  for index, token_name in ipairs(default) do
    style_numbers[token_name] = index - 1
  end
  for index, style_name in ipairs(predefined) do
    style_numbers[style_name] = index + 31
  end

  -- Methods available on lexer objects via `lexer:method()`.
  local methods = {
    add_rule = M.add_rule,
    modify_rule = M.modify_rule,
    add_style = M.add_style,
    add_fold_point = M.add_fold_point,
    join_tokens = join_tokens,
    build_grammar = build_grammar,
    embed = M.embed,
    lex = M.lex,
    fold = M.fold
  }

  return setmetatable({
    _NAME = name,
    _LEXBYLINE = option('lex_by_line'),
    _FOLDBYINDENTATION = option('fold_by_indentation'),
    _CASEINSENSITIVEFOLDPOINTS = option('case_insensitive_fold_points'),
    _lexer = option('inherit'),
    _TOKENSTYLES = style_numbers,
    _numstyles = #default,
    _EXTRASTYLES = {}
  }, {__index = methods})
end

-- Legacy support for older lexers.
-- Processes the `lexer._rules`, `lexer._tokenstyles`, and `lexer._foldsymbols`
-- tables.
-- Since legacy lexers may be processed up to twice, ensure their default styles
-- and rules are not processed more than once.
-local function process_legacy_lexer(lexer) - local function warn(msg) --[[io.stderr:write(msg, "\n")]] end - if not lexer._LEGACY then - lexer._LEGACY = true - warn("lexers as tables are deprecated; use 'lexer.new()'") - local token_styles = {} - for i = 1, #default do token_styles[default[i]] = i - 1 end - for i = 1, #predefined do token_styles[predefined[i]] = i + 31 end - lexer._TOKENSTYLES, lexer._numstyles = token_styles, #default - lexer._EXTRASTYLES = {} - setmetatable(lexer, getmetatable(M.new(''))) - if lexer._rules then - warn("lexer '_rules' table is deprecated; use 'add_rule()'") - for i = 1, #lexer._rules do - lexer:add_rule(lexer._rules[i][1], lexer._rules[i][2]) - end - end - end - if lexer._tokenstyles then - warn("lexer '_tokenstyles' table is deprecated; use 'add_style()'") - for token, style in pairs(lexer._tokenstyles) do - -- If this legacy lexer is being processed a second time, only add styles - -- added since the first processing. - if not lexer._TOKENSTYLES[token] then lexer:add_style(token, style) end - end - end - if lexer._foldsymbols then - warn("lexer '_foldsymbols' table is deprecated; use 'add_fold_point()'") - for token_name, symbols in pairs(lexer._foldsymbols) do - if type(symbols) == 'table' and token_name ~= '_patterns' then - for symbol, v in pairs(symbols) do - lexer:add_fold_point(token_name, symbol, v) - end - end - end - if lexer._foldsymbols._case_insensitive then - lexer._CASEINSENSITIVEFOLDPOINTS = true - end - end -end - -local lexers = {} -- cache of loaded lexers ---- --- Initializes or loads and returns the lexer of string name *name*. --- Scintilla calls this function in order to load a lexer. Parent lexers also --- call this function in order to load child lexers and vice-versa. The user --- calls this function in order to load a lexer when using Scintillua as a Lua --- library. --- @param name The name of the lexing language. --- @param alt_name The alternate name of the lexing language. 
This is useful for --- embedding the same child lexer with multiple sets of start and end tokens. --- @param cache Flag indicating whether or not to load lexers from the cache. --- This should only be `true` when initially loading a lexer (e.g. not from --- within another lexer for embedding purposes). --- The default value is `false`. --- @return lexer object --- @name load -function M.load(name, alt_name, cache) - if cache and lexers[alt_name or name] then return lexers[alt_name or name] end - - -- When using Scintillua as a stand-alone module, the `property` and - -- `property_int` tables do not exist (they are not useful). Create them in - -- order prevent errors from occurring. - if not M.property then - M.property, M.property_int = {}, setmetatable({}, { - __index = function(t, k) return tonumber(M.property[k]) or 0 end, - __newindex = function() error('read-only property') end - }) - end - - -- Load the language lexer with its rules, styles, etc. - -- However, replace the default `WHITESPACE` style name with a unique - -- whitespace style name (and then automatically add it afterwards), since - -- embedded lexing relies on these unique whitespace style names. Note that - -- loading embedded lexers changes `WHITESPACE` again, so when adding it - -- later, do not reference the potentially incorrect value. - M.WHITESPACE = (alt_name or name)..'_whitespace' - local lexer = dofile(assert(package.searchpath(name, M.path))) - assert(lexer, string.format("'%s.lua' did not return a lexer", name)) - if alt_name then lexer._NAME = alt_name end - if not getmetatable(lexer) or lexer._LEGACY then - -- A legacy lexer may need to be processed a second time in order to pick up - -- any `_tokenstyles` or `_foldsymbols` added after `l.embed_lexer()`. 
- process_legacy_lexer(lexer) - if lexer._lexer and lexer._lexer._LEGACY then - process_legacy_lexer(lexer._lexer) -- mainly for `_foldsymbols` edits - end - end - lexer:add_style((alt_name or name)..'_whitespace', M.STYLE_WHITESPACE) - - -- If the lexer is a proxy or a child that embedded itself, set the parent to - -- be the main lexer. - if lexer._lexer then lexer = lexer._lexer end - - lexers[alt_name or name] = lexer - return lexer -end - --- The following are utility functions lexers will have access to. - --- Common patterns. -M.any = lpeg_P(1) -M.ascii = lpeg_R('\000\127') -M.extend = lpeg_R('\000\255') -M.alpha = lpeg_R('AZ', 'az') -M.digit = lpeg_R('09') -M.alnum = lpeg_R('AZ', 'az', '09') -M.lower = lpeg_R('az') -M.upper = lpeg_R('AZ') -M.xdigit = lpeg_R('09', 'AF', 'af') -M.cntrl = lpeg_R('\000\031') -M.graph = lpeg_R('!~') -M.print = lpeg_R(' ~') -M.punct = lpeg_R('!/', ':@', '[\'', '{~') -M.space = lpeg_S('\t\v\f\n\r ') - -M.newline = lpeg_S('\r\n\f')^1 -M.nonnewline = 1 - M.newline -M.nonnewline_esc = 1 - (M.newline + '\\') + '\\' * M.any - -M.dec_num = M.digit^1 -M.hex_num = '0' * lpeg_S('xX') * M.xdigit^1 -M.oct_num = '0' * lpeg_R('07')^1 -M.integer = lpeg_S('+-')^-1 * (M.hex_num + M.oct_num + M.dec_num) -M.float = lpeg_S('+-')^-1 * - ((M.digit^0 * '.' * M.digit^1 + M.digit^1 * '.' * M.digit^0) * - (lpeg_S('eE') * lpeg_S('+-')^-1 * M.digit^1)^-1 + - (M.digit^1 * lpeg_S('eE') * lpeg_S('+-')^-1 * M.digit^1)) - -M.word = (M.alpha + '_') * (M.alnum + '_')^0 - ---- --- Creates and returns a token pattern with token name *name* and pattern --- *patt*. --- If *name* is not a predefined token name, its style must be defined in the --- lexer's `_tokenstyles` table. --- @param name The name of token. If this name is not a predefined token name, --- then a style needs to be assiciated with it in the lexer's `_tokenstyles` --- table. --- @param patt The LPeg pattern associated with the token. 
--- @return pattern --- @usage local ws = token(l.WHITESPACE, l.space^1) --- @usage local annotation = token('annotation', '@' * l.word) --- @name token -function M.token(name, patt) - return lpeg_Cc(name) * patt * lpeg_Cp() -end - ---- --- Creates and returns a pattern that matches a range of text bounded by --- *chars* characters. --- This is a convenience function for matching more complicated delimited ranges --- like strings with escape characters and balanced parentheses. *single_line* --- indicates whether or not the range must be on a single line, *no_escape* --- indicates whether or not to ignore '\' as an escape character, and *balanced* --- indicates whether or not to handle balanced ranges like parentheses and --- requires *chars* to be composed of two characters. --- @param chars The character(s) that bound the matched range. --- @param single_line Optional flag indicating whether or not the range must be --- on a single line. --- @param no_escape Optional flag indicating whether or not the range end --- character may be escaped by a '\\' character. --- @param balanced Optional flag indicating whether or not to match a balanced --- range, like the "%b" Lua pattern. This flag only applies if *chars* --- consists of two different characters (e.g. "()"). 
--- @return pattern --- @usage local dq_str_escapes = l.delimited_range('"') --- @usage local dq_str_noescapes = l.delimited_range('"', false, true) --- @usage local unbalanced_parens = l.delimited_range('()') --- @usage local balanced_parens = l.delimited_range('()', false, false, true) --- @see nested_pair --- @name delimited_range -function M.delimited_range(chars, single_line, no_escape, balanced) - local s = chars:sub(1, 1) - local e = #chars == 2 and chars:sub(2, 2) or s - local range - local b = balanced and s or '' - local n = single_line and '\n' or '' - if no_escape then - local invalid = lpeg_S(e..n..b) - range = M.any - invalid - else - local invalid = lpeg_S(e..n..b) + '\\' - range = M.any - invalid + '\\' * M.any - end - if balanced and s ~= e then - return lpeg_P{s * (range + lpeg_V(1))^0 * e} - else - return s * range^0 * lpeg_P(e)^-1 - end -end - ---- --- Creates and returns a pattern that matches pattern *patt* only at the --- beginning of a line. --- @param patt The LPeg pattern to match on the beginning of a line. --- @return pattern --- @usage local preproc = token(l.PREPROCESSOR, l.starts_line('#') * --- l.nonnewline^0) --- @name starts_line -function M.starts_line(patt) - return lpeg_Cmt(lpeg_C(patt), function(input, index, match, ...) - local pos = index - #match - if pos == 1 then return index, ... end - local char = input:sub(pos - 1, pos - 1) - if char == '\n' or char == '\r' or char == '\f' then return index, ... end - end) -end - ---- --- Creates and returns a pattern that verifies that string set *s* contains the --- first non-whitespace character behind the current match position. --- @param s String character set like one passed to `lpeg.S()`. 
--- @return pattern --- @usage local regex = l.last_char_includes('+-*!%^&|=,([{') * --- l.delimited_range('/') --- @name last_char_includes -function M.last_char_includes(s) - s = '['..s:gsub('[-%%%[]', '%%%1')..']' - return lpeg_P(function(input, index) - if index == 1 then return index end - local i = index - while input:sub(i - 1, i - 1):match('[ \t\r\n\f]') do i = i - 1 end - if input:sub(i - 1, i - 1):match(s) then return index end - end) -end - ---- --- Returns a pattern that matches a balanced range of text that starts with --- string *start_chars* and ends with string *end_chars*. --- With single-character delimiters, this function is identical to --- `delimited_range(start_chars..end_chars, false, true, true)`. --- @param start_chars The string starting a nested sequence. --- @param end_chars The string ending a nested sequence. --- @return pattern --- @usage local nested_comment = l.nested_pair('/*', '*/') --- @see delimited_range --- @name nested_pair -function M.nested_pair(start_chars, end_chars) - local s, e = start_chars, lpeg_P(end_chars)^-1 - return lpeg_P{s * (M.any - s - end_chars + lpeg_V(1))^0 * e} -end - ---- --- Creates and returns a pattern that matches any single word in string *words*. --- *case_insensitive* indicates whether or not to ignore case when matching --- words. --- This is a convenience function for simplifying a set of ordered choice word --- patterns. --- @param words A string list of words separated by spaces. --- @param case_insensitive Optional boolean flag indicating whether or not the --- word match is case-insensitive. The default value is `false`. --- @param word_chars Unused legacy parameter. 
--- @return pattern --- @usage local keyword = token(l.KEYWORD, word_match[[foo bar baz]]) --- @usage local keyword = token(l.KEYWORD, word_match([[foo-bar foo-baz --- bar-foo bar-baz baz-foo baz-bar]], true)) --- @name word_match -function M.word_match(words, case_insensitive, word_chars) - local word_list = {} - if type(words) == 'table' then - -- Legacy `word_match(word_list, word_chars, case_insensitive)` form. - words = table.concat(words, ' ') - word_chars, case_insensitive = case_insensitive, word_chars - end - for word in words:gmatch('%S+') do - word_list[case_insensitive and word:lower() or word] = true - for char in word:gmatch('[^%w_]') do - if not (word_chars or ''):find(char, 1, true) then - word_chars = (word_chars or '')..char - end - end - end - local chars = M.alnum + '_' - if (word_chars or '') ~= '' then chars = chars + lpeg_S(word_chars) end - return lpeg_Cmt(chars^1, function(input, index, word) - if case_insensitive then word = word:lower() end - return word_list[word] and index or nil - end) -end - --- Deprecated legacy function. Use `parent:embed()` instead. --- Embeds child lexer *child* in parent lexer *parent* using patterns --- *start_rule* and *end_rule*, which signal the beginning and end of the --- embedded lexer, respectively. --- @param parent The parent lexer. --- @param child The child lexer. --- @param start_rule The pattern that signals the beginning of the embedded --- lexer. --- @param end_rule The pattern that signals the end of the embedded lexer. 
--- @usage l.embed_lexer(M, css, css_start_rule, css_end_rule) --- @usage l.embed_lexer(html, M, php_start_rule, php_end_rule) --- @usage l.embed_lexer(html, ruby, ruby_start_rule, ruby_end_rule) --- @see embed --- @name embed_lexer -function M.embed_lexer(parent, child, start_rule, end_rule) - if not getmetatable(parent) then process_legacy_lexer(parent) end - if not getmetatable(child) then process_legacy_lexer(child) end - parent:embed(child, start_rule, end_rule) -end - --- Determines if the previous line is a comment. --- This is used for determining if the current comment line is a fold point. --- @param prefix The prefix string defining a comment. --- @param text The text passed to a fold function. --- @param pos The pos passed to a fold function. --- @param line The line passed to a fold function. --- @param s The s passed to a fold function. -local function prev_line_is_comment(prefix, text, pos, line, s) - local start = line:find('%S') - if start < s and not line:find(prefix, start, true) then return false end - local p = pos - 1 - if text:sub(p, p) == '\n' then - p = p - 1 - if text:sub(p, p) == '\r' then p = p - 1 end - if text:sub(p, p) ~= '\n' then - while p > 1 and text:sub(p - 1, p - 1) ~= '\n' do p = p - 1 end - while text:sub(p, p):find('^[\t ]$') do p = p + 1 end - return text:sub(p, p + #prefix - 1) == prefix - end - end - return false -end - --- Determines if the next line is a comment. --- This is used for determining if the current comment line is a fold point. --- @param prefix The prefix string defining a comment. --- @param text The text passed to a fold function. --- @param pos The pos passed to a fold function. --- @param line The line passed to a fold function. --- @param s The s passed to a fold function. 
-local function next_line_is_comment(prefix, text, pos, line, s) - local p = text:find('\n', pos + s) - if p then - p = p + 1 - while text:sub(p, p):find('^[\t ]$') do p = p + 1 end - return text:sub(p, p + #prefix - 1) == prefix - end - return false -end - ---- --- Returns a fold function (to be passed to `lexer:add_fold_point()`) that folds --- consecutive line comments that start with string *prefix*. --- @param prefix The prefix string defining a line comment. --- @usage lexer:add_fold_point(l.COMMENT, '--', l.fold_line_comments('--')) --- @usage lexer:add_fold_point(l.COMMENT, '//', l.fold_line_comments('//')) --- @name fold_line_comments -function M.fold_line_comments(prefix) - local property_int = M.property_int - return function(text, pos, line, s) - if property_int['fold.line.comments'] == 0 then return 0 end - if s > 1 and line:match('^%s*()') < s then return 0 end - local prev_line_comment = prev_line_is_comment(prefix, text, pos, line, s) - local next_line_comment = next_line_is_comment(prefix, text, pos, line, s) - if not prev_line_comment and next_line_comment then return 1 end - if prev_line_comment and not next_line_comment then return -1 end - return 0 - end -end - -M.property_expanded = setmetatable({}, { - -- Returns the string property value associated with string property *key*, - -- replacing any "$()" and "%()" expressions with the values of their keys. - __index = function(t, key) - return M.property[key]:gsub('[$%%]%b()', function(key) - return t[key:sub(3, -2)] - end) - end, - __newindex = function() error('read-only property') end -}) - ---[[ The functions and fields below were defined in C. - ---- --- Returns the line number of the line that contains position *pos*, which --- starts from 1. --- @param pos The position to get the line number of. 
--- @return number -local function line_from_position(pos) end -]] - -return M diff --git a/lexlua/mumps.lua b/lexlua/mumps.lua deleted file mode 100644 index 8a7d7d8f1..000000000 --- a/lexlua/mumps.lua +++ /dev/null @@ -1,112 +0,0 @@ --- Copyright 2015-2018 Mitchell mitchell.att.foicica.com. See License.txt. --- MUMPS (M) LPeg lexer. - -local l = require('lexer') -local token, word_match = l.token, l.word_match -local P, R, S = lpeg.P, lpeg.R, lpeg.S - -local M = {_NAME = 'mumps'} - --- Whitespace. -local ws = token(l.WHITESPACE, l.space^1) - --- Comments. -local comment = token(l.COMMENT, ';' * l.nonnewline_esc^0) - --- Strings. -local string = token(l.STRING, l.delimited_range('"', true)) - --- Numbers. -local number = token(l.NUMBER, l.float + l.integer) -- TODO: float? - --- Keywords. -local keyword = token(l.KEYWORD, word_match({ - -- Abbreviations. - 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'q', - 'r', 's', 'u', 'v', 'w', 'x', - -- Full. - 'break', 'close', 'do', 'else', 'for', 'goto', 'halt', 'hang', 'if', 'job', - 'kill', 'lock', 'merge', 'new', 'open', 'quit', 'read', 'set', 'use', 'view', - 'write', 'xecute', - -- Cache- or GTM-specific. - 'catch', 'continue', 'elseif', 'tcommit', 'throw', 'trollback', 'try', - 'tstart', 'while', -}, nil, true)) - --- Functions. -local func = token(l.FUNCTION, '$' * word_match({ - -- Abbreviations. - 'a', 'c', 'd', 'e', 'f', 'fn', 'g', 'j', 'l', 'n', 'na', 'o', 'p', 'q', 'ql', - 'qs', 'r', 're', 's', 'st', 't', 'tr', 'v', - -- Full. - 'ascii', 'char', 'data', 'extract', 'find', 'fnumber', 'get', 'justify', - 'length', 'next', 'name', 'order', 'piece', 'query', 'qlength', 'qsubscript', - 'random', 'reverse', 'select', 'stack', 'text', 'translate', 'view', - -- Z function abbreviations. - 'zd', 'zdh', 'zdt', 'zdth', 'zh', 'zt', 'zth', 'zu', 'zp', - -- Z functions. 
- 'zabs', 'zarccos', 'zarcsin', 'zarctan', 'zcos', 'zcot', 'zcsc', 'zdate', - 'zdateh', 'zdatetime', 'zdatetimeh', 'zexp', 'zhex', 'zln', 'zlog', 'zpower', - 'zsec', 'zsin', 'zsqr', 'ztan', 'ztime', 'ztimeh', 'zutil', 'zf', 'zprevious', - -- Cache- or GTM-specific. - 'bit', 'bitcount', 'bitfind', 'bitlogic', 'case', 'classmethod', 'classname', - 'decimal', 'double', 'factor', 'i', 'increment', 'inumber', 'isobject', - 'isvaliddouble', 'isvalidnum', 'li', 'list', 'lb', 'listbuild', 'ld', - 'listdata', 'lf', 'listfind', 'lfs', 'listfromstring', 'lg', 'listget', 'll', - 'listlength', 'listnext', 'ls', 'listsame', 'lts', 'listtostring', 'lv', - 'listvalid', 'locate', 'match', 'method', 'nc', 'nconvert', 'normalize', - 'now', 'num', 'number', 'parameter', 'prefetchoff', 'prefetchon', 'property', - 'replace', 'sc', 'sconvert', 'sortbegin', 'sortend', 'wa', 'wascii', 'wc', - 'wchar', 'we', 'wextract', 'wf', 'wfind', 'wiswide', 'wl', 'wlength', 'wre', - 'wreverse', 'xecute' -}, nil, true)) - --- Variables. -local variable = token(l.VARIABLE, '$' * l.word_match({ - -- Abbreviations. - 'ec', 'es', 'et', 'h', 'i', 'j', 'k', 'p', 'q', 's', 'st', 't', 'tl', - -- Full. - 'device', 'ecode', 'estack', 'etrap', 'halt', 'horolog', 'io', 'job', - 'namespace', 'principal', 'quit', 'roles', 'storage', 'stack', 'system', - 'test', 'this', 'tlevel', 'username', 'x', 'y', - -- Z variable abbreviations. - 'za', 'zb', 'zc', 'ze', 'zh', 'zi', 'zj', 'zm', 'zn', 'zo', 'zp', 'zr', 'zs', - 'zt', 'zts', 'ztz', 'zv', - -- Z variables. - 'zchild', 'zeof', 'zerror', 'zhorolog', 'zio', 'zjob', 'zmode', 'zname', - 'znspace', 'zorder', 'zparent', 'zpi', 'zpos', 'zreference', 'zstorage', - 'ztimestamp', 'ztimezone', 'ztrap', 'zversion', -}, nil, true)) - --- Function entity. -local entity = token(l.LABEL, l.starts_line(('%' + l.alpha) * l.alnum^0)) - --- Support functions. -local support_function = '$$' * ('%' + l.alpha) * l.alnum^0 * - (('%' + l.alpha) * l.alnum^0)^-1 - --- Identifiers. 
-local identifier = token(l.IDENTIFIER, l.alpha * l.alnum^0) - --- Operators. -local operator = token(l.OPERATOR, S('+-/*<>!=_@#&|?:\\\',()[]')) - -M._rules = { - {'whitespace', ws}, - {'keyword', keyword}, - {'variable', variable}, - {'identifier', identifier}, - {'string', string}, - {'comment', comment}, - {'number', number}, - {'operator', operator}, -} - -M._foldsymbols = { - _patterns = {'%l+', '[{}]', '/%*', '%*/', '//'}, - [l.PREPROCESSOR] = {['if'] = 1, ifdef = 1, ifndef = 1, endif = -1}, - [l.OPERATOR] = {['{'] = 1, ['}'] = -1}, - [l.COMMENT] = {['/*'] = 1, ['*/'] = -1, ['//'] = l.fold_line_comments('//')} -} - -return M diff --git a/lexlua/ps.lua.orig b/lexlua/ps.lua.orig deleted file mode 100644 index c6a98faa9..000000000 --- a/lexlua/ps.lua.orig +++ /dev/null @@ -1,167 +0,0 @@ --- Copyright 2017 Marcio Baraco <marciorps@gmail.com>. See LICENSE. --- Postscript LPeg lexer. - -local l = require('lexer') -local token, word_match = l.token, l.word_match -local P, R, S = lpeg.P, lpeg.R, lpeg.S - -local M = {_NAME = 'ps'} - --- Whitespace. -local ws = token(l.WHITESPACE, l.space^1) - --- Comments. -local comment = token(l.COMMENT, '%' * l.nonnewline^0) - --- Strings. -local nested_string = l.delimited_range('()', false, false, true) -local hex_string = P('<') * (l.xdigit + l.space)^0 * P('>')^-1 -local enc_string = P('<~') * (R('!u') + l.space)^0 * P('~>') -local str = token(l.STRING, nested_string + hex_string + enc_string) - --- Numbers. -local frac = (P('.') * l.digit^1) -local expo = (S('eE') * S('+-')^-1 * l.digit^1) -local decm = S('+-')^-1 * l.digit ^ 1 * frac^-1 * expo^-1 -local radx = l.digit^-2 * '#' * l.alnum^1 --- TODO: Accept only chars that fit radix, ie [01] for 2#, hex for 16# and so. -local number = token(l.NUMBER, decm + radx) - --- PostScript allows almost all characters in names. -local word = (l.graph - S('()<>[]{}/%'))^1 --- Names. -local identifier = token(l.IDENTIFIER, word) --- Deferred Names. 
-local label = token(l.LABEL, '/' * word) --- Immediately Evaluated Names. -local preproc = token(l.PREPROCESSOR, '//' * word) - --- Object constructors. -local operator = token(l.OPERATOR, S('[]{}=') + P('<<') + P('>>') + P('==')) - --- Operators: --- + l.KEYWORD for basic ops --- + l.FUNCTION for graphic ops --- + l.CLASS for weird ps ops -local keyword = token(l.KEYWORD, word_match{ - -- Control operators. - 'exec', 'eexec', 'if', 'ifelse', 'for', 'repeat', 'loop', 'exit', 'stop', - 'stopped', 'countexecstack', 'execstack', 'quit', 'start', - -- Stack manipulation operators. - 'pop', 'exch', 'dup', 'copy', 'index', 'roll', 'clear', 'count', 'mark', - 'cleartomark', 'counttomark', - -- Array and operators. - 'array', 'string', 'length', 'get', 'put', 'getinterval', 'putinterval', - 'aload', 'astore', 'packedarray', 'setpacking', 'currentpacking', 'forall', - 'anchorsearch', 'search', 'token', - -- Dictionary operators. - 'dict', 'maxlength', 'begin', 'end', 'def', 'undef', 'load', 'store', 'known', - 'where', 'currentdict', 'errordict', 'systemdict', 'userdict', 'globaldict', - 'shareddict', 'statusdict', 'countdictstack', 'cleardictstack', 'dictstack', - -- Type, attribute and conversion operators. - 'type', 'cvlit', 'cvx', 'cvi', 'cvn', 'cvrs', 'cvs', 'cvr', 'xcheck', - 'executeonly', 'noaccess', 'readonly', 'rcheck', 'wcheck', - -- Arithmetic and math operators. - 'add', 'div', 'idiv', 'mod', 'mul', 'sub', 'abs', 'neg', 'ceiling', 'floor', - 'round', 'truncate', 'sqrt', 'atan', 'cos', 'sin', 'exp', 'ln', 'log', 'rand', - 'srand', 'rrand', - -- Relational, boolean and bitwise operators. - 'eq', 'ne', 'ge', 'gt', 'le', 'lt', 'and', 'not', 'or', 'xor', 'true', - 'false', 'bitshift', - -- Coordinate system and matrix operators. 
- 'matrix', 'initmatrix', 'identmatrix', 'defaultmatrix', 'currentmatrix', - 'setmatrix', 'translate', 'scale', 'rotate', 'concat', 'concatmatrix', - 'transform', 'dtransform', 'itransform', 'idtransform', 'invertmatrix', -}) -local func = token(l.FUNCTION, word_match{ - -- Path construction operators. - 'newpath', 'currentpoint', 'moveto', 'rmoveto', 'lineto', 'rlineto', 'arc', - 'arcn', 'arct', 'arcto', 'curveto', 'rcurveto', 'closepath', 'flattenpath', - 'reversepath', 'strokepath', 'ustrokepath', 'charpath', 'uappend', 'clippath', - 'setbbox', 'pathbbox', 'pathforall', 'upath', 'ucache', 'initclip', 'clip', - 'eoclip', 'rectclip', - -- Glyph and font operators. - 'definefont', 'composefont', 'undefinefont', 'findfont', 'scalefont', - 'makefont', 'setfont', 'rootfont', 'currentfont', 'selectfont', 'show', - 'ashow', 'widthshow', 'awidthshow', 'xshow', 'yshow', 'xyshow', 'glyphshow', - 'stringwidth', 'cshow', 'kshow', 'findencoding', 'FontDirectory', - 'GlobalFontDirectory', 'SharedFontDirectory', 'StandardEncoding', - 'ISOLatin1Encoding', 'setcachedevice', 'setcachedevice2', 'setcharwidth', - -- CID Font operators. - 'addglyph', 'beginbfchar', 'beginbfrange', 'begincidchar', 'begincidrange', - 'begincmap', 'begincodespacerange', 'beginnotdefchar', 'beginnotdefrange', - 'beginrearrangedfont', 'beginusematrix', 'endbfchar', 'endbfrange', - 'endcidchar', 'endcidrange', 'endcmap', 'endcodespacerange', 'endnotdefchar', - 'endnotdefrange', 'endrearrangedfont', 'endusermatrix', 'removeall', - 'removeglyphs', 'StartData', 'usecmap', 'usefont', - -- Painting operations. - 'erasepage', 'stroke', 'fill', 'eofill', 'rectstroke', 'rectfill', 'ustroke', - 'ufill', 'ueofill', 'shfill', 'image', 'imagemask', 'colorimage', - -- Insideness testing operators. - 'infill', 'ineofill', 'inufill', 'inueofill', 'instroke', 'inustroke', - -- Form and pattern operators. - 'makepattern', 'setpattern', 'execform', - -- Graphics state operators. 
- 'gsave', 'grestore', 'clipsave', 'cliprestore', 'grestoreall', 'initgraphics', - 'gstate', 'setgstate', 'currentgstate', 'setlinewidth', 'currentlinewidth', - 'setlinecap', 'currentlinecap', 'setlinejoin', 'currentlinejoin', - 'setmiterlimit', 'currentmiterlimit', 'setstrokeadjust', - 'currentstrokeadjust', 'setdash', 'currentdash', 'setcolorspace', - 'currentcolorspace', 'setcolor', 'setgray', 'currentgray', 'sethsbcolor', - 'currenthsbcolor', 'setrgbcolor', 'currentrgbcolor', 'setcmykcolor', - 'currentcmykcolor', 'sethalftone', 'currenthalftone', 'setscreen', - 'currentscreen', 'setcolorscreen', 'currentcolorscreen', 'settransfer', - 'currenttransfer', 'setcolortransfer', 'currentcolortransfer', - 'setblackgeneration', 'currentblackgeneration', 'setundercolorremoval', - 'currentundercolorremoval', 'setcolorrendering', 'currentcolorrendering', - 'setflat', 'currentflat', 'setoverprint', 'currentoverprint', 'setsmoothness', - 'currentsmoothness', 'currentcolor', - -- Device setup operators. 
- 'showpage', 'copypage', 'setpagedevice', 'currentpagedevice', 'nulldevice', - 'currenttrapparams', 'settrapparams', 'settrapzone', -}) -local misc = token(l.CLASS, word_match{ - -- Miscellaneous operators - 'defineresource', 'undefineresource', 'findresource', 'findcolorrendering', - 'resourcestatus', 'resourceforall', 'GetHalftoneName', 'GetPageDeviceName', - 'GetSubstituteCRD', 'save', 'restore', 'setglobal', 'setshared', - 'currentglobal', 'gcheck', 'scheck', 'startjob', 'defineuserobject', - 'execuserobject', 'undefineuserobject', 'UserObjects', 'bind', 'null', - 'version', 'realtime', 'usertime', 'languagelevel', 'product', 'revision', - 'serialnumber', 'executive', 'echo', 'prompt', 'setsystemparams', - 'currentsystemparams', 'setuserparams', 'currentuserparams', 'setdevparams', - 'currentdevparams', 'vmreclaim', 'setvmthreshold', 'vmstatus', 'cachestatus', - 'setcachelimit', 'setcacheparams', 'currentcacheparams', 'setucacheparams', - 'ucachestatus', 'currentshared', 'exitserver', 'serverdict', - -- File operators - 'file', 'filter', 'closefile', 'read', 'write', 'readhexstring', - 'writehexstring', 'readstring', 'writestring', 'readline', 'bytesavailable', - 'flush', 'flushfile', 'resetfile', 'status', 'run', 'currentfile', - 'deletefile', 'renamefile', 'filenameforall', 'setfileposition', - 'fileposition', 'print', 'stack', 'pstack', 'printobject', 'writeobject', - 'setobjectformat', 'currentobjectformat', - -- Errors. 
- 'configurationerror', 'dictfull', 'dictstackoverflow', 'dictstackunderflow', - 'execstackoverflow', 'handleerror', 'interrupt', 'invalidaccess', - 'invalidexit', 'invalidfileaccess', 'invalidfont', 'invalidrestore', - 'ioerror', 'limitcheck', 'nocurrentpoint', 'rangecheck', 'stackoverflow', - 'stackunderflow', 'syntaxerror', 'timeout', 'typecheck', 'undefined', - 'undefinedfilename', 'undefinedresource', 'undefinedresult', 'unmatchedmark', - 'unregistered', 'VMerror', -}) - -M._rules = { - {'whitespace', ws}, - {'comment', comment}, - {'number', number}, - {'preprocessor', preproc}, - {'label', label}, - {'keyword', keyword}, - {'function', func}, - {'class', misc}, - {'operator', operator}, - {'string', str}, - {'identifier', identifier}, -} - -return M |