| author | mitchell <unknown> | 2020-04-25 16:26:31 -0400 |
|---|---|---|
| committer | mitchell <unknown> | 2020-04-25 16:26:31 -0400 |
| commit | fad15f79b1230b3076be515d6894c8919562809b (patch) | |
| tree | 72c848ef02c3331de5ca54eff7adaea3a9a6fb88 /lexlua/lexer.lua | |
| parent | 1fd02a367dec125c0b49dd9246a0928433866b96 (diff) | |
| download | scintilla-mirror-fad15f79b1230b3076be515d6894c8919562809b.tar.gz | |
Reformatted Lua LPeg lexers and added new convenience functions and a new pattern.
`lexer.range()` replaces `lexer.delimited_range()` and `lexer.nested_pair()`.
`lexer.to_eol()` replaces `patt * lexer.nonnewline^0` constructs.
`lexer.number` replaces `lexer.float + lexer.integer`.
Also added unit tests for lexer functions.
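
For readers skimming the patch, here is a brief sketch (not part of the commit itself) of how the new helpers are meant to be used inside a lexer definition; the `local lexer = require('lexer')` bootstrapping follows the usual Scintillua lexer convention and is assumed here:

```lua
-- Illustrative lexer rules; the old constructs in the comments are the
-- ones this commit replaces.
local lexer = require('lexer')
local token = lexer.token

-- Was: token(lexer.COMMENT, '#' * lexer.nonnewline^0)
local line_comment = token(lexer.COMMENT, lexer.to_eol('#'))

-- Was: token(lexer.STRING, lexer.delimited_range('"'))
local dq_str = token(lexer.STRING, lexer.range('"'))

-- Was: token(lexer.COMMENT, '/*' * (lexer.any - '*/')^0 * P('*/')^-1)
local block_comment = token(lexer.COMMENT, lexer.range('/*', '*/'))

-- Was: token(lexer.NUMBER, lexer.float + lexer.integer)
local number = token(lexer.NUMBER, lexer.number)
```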
Diffstat (limited to 'lexlua/lexer.lua')
| -rw-r--r-- | lexlua/lexer.lua | 128 |
1 file changed, 94 insertions, 34 deletions
```diff
diff --git a/lexlua/lexer.lua b/lexlua/lexer.lua
index 68183aa29..d133eb11d 100644
--- a/lexlua/lexer.lua
+++ b/lexlua/lexer.lua
@@ -139,10 +139,10 @@ local M = {}
 -- [`lexer.punct`](), [`lexer.space`](), [`lexer.newline`](),
 -- [`lexer.nonnewline`](), [`lexer.nonnewline_esc`](), [`lexer.dec_num`](),
 -- [`lexer.hex_num`](), [`lexer.oct_num`](), [`lexer.integer`](),
--- [`lexer.float`](), and [`lexer.word`](). You may use your own token names if
--- none of the above fit your language, but an advantage to using predefined
--- token names is that your lexer's tokens will inherit the universal syntax
--- highlighting color theme used by your text editor.
+-- [`lexer.float`](), [`lexer.number`](), and [`lexer.word`](). You may use your
+-- own token names if none of the above fit your language, but an advantage to
+-- using predefined token names is that your lexer's tokens will inherit the
+-- universal syntax highlighting color theme used by your text editor.
 --
 -- ##### Example Tokens
 --
@@ -185,9 +185,8 @@ local M = {}
 --
 -- Line-style comments with a prefix character(s) are easy to express with LPeg:
 --
--- local shell_comment = token(lexer.COMMENT, '#' * lexer.nonnewline^0)
--- local c_line_comment = token(lexer.COMMENT,
--- '//' * lexer.nonnewline_esc^0)
+-- local shell_comment = token(lexer.COMMENT, lexer.to_eol('#'))
+-- local c_line_comment = token(lexer.COMMENT, lexer.to_eol('//', true))
 --
 -- The comments above start with a '#' or "//" and go to the end of the line.
 -- The second comment recognizes the next line also as a comment if the current
@@ -196,8 +195,7 @@ local M = {}
 --
 -- C-style "block" comments with a start and end delimiter are also easy to
 -- express:
 --
--- local c_comment = token(lexer.COMMENT, '/*' * (lexer.any - '*/')^0 *
--- P('*/')^-1)
+-- local c_comment = token(lexer.COMMENT, lexer.range('/*', '*/'))
 --
 -- This comment starts with a "/\*" sequence and contains anything up to and
 -- including an ending "\*/" sequence. The ending "\*/" is optional so the lexer
 --
@@ -205,21 +203,13 @@ local M = {}
 --
 -- **Strings**
 --
--- It is tempting to think that a string is not much different from the block
--- comment shown above in that both have start and end delimiters:
+-- Most programming languages allow escape sequences in strings such that a
+-- sequence like "\\"" in a double-quoted string indicates that the
+-- '"' is not the end of the string. [`lexer.range()`]() handles escapes
+-- inherently.
 --
--- local dq_str = '"' * (lexer.any - '"')^0 * P('"')^-1
--- local sq_str = "'" * (lexer.any - "'")^0 * P("'")^-1
--- local simple_string = token(lexer.STRING, dq_str + sq_str)
---
--- However, most programming languages allow escape sequences in strings such
--- that a sequence like "\\"" in a double-quoted string indicates that the
--- '"' is not the end of the string. The above token incorrectly matches
--- such a string. Instead, use the [`lexer.delimited_range()`]() convenience
--- function.
---
--- local dq_str = lexer.delimited_range('"')
--- local sq_str = lexer.delimited_range("'")
+-- local dq_str = lexer.range('"')
+-- local sq_str = lexer.range("'")
 -- local string = token(lexer.STRING, dq_str + sq_str)
 --
 -- In this case, the lexer treats '\' as an escape character in a string
@@ -228,9 +218,9 @@ local M = {}
 --
 -- **Numbers**
 --
 -- Most programming languages have the same format for integer and float tokens,
--- so it might be as simple as using a couple of predefined LPeg patterns:
+-- so it might be as simple as using a predefined LPeg pattern:
 --
--- local number = token(lexer.NUMBER, lexer.float + lexer.integer)
+-- local number = token(lexer.NUMBER, lexer.number)
 --
 -- However, some languages allow postfix characters on integers.
 --
@@ -714,9 +704,9 @@ local M = {}
 -- lex:add_rule('custom', token('custom', P('quux')))
 -- lex:add_style('custom', lexer.STYLE_KEYWORD .. ',bold')
 -- lex:add_rule('identifier', token(lexer.IDENTIFIER, lexer.word))
--- lex:add_rule('string', token(lexer.STRING, lexer.delimited_range('"')))
--- lex:add_rule('comment', token(lexer.COMMENT, '#' * lexer.nonnewline^0))
--- lex:add_rule('number', token(lexer.NUMBER, lexer.float + lexer.integer))
+-- lex:add_rule('string', token(lexer.STRING, lexer.range('"')))
+-- lex:add_rule('comment', token(lexer.COMMENT, lexer.to_eol('#')))
+-- lex:add_rule('number', token(lexer.NUMBER, lexer.number))
 -- lex:add_rule('operator', token(lexer.OPERATOR, S('+-*/%^=<>,.()[]{}')))
 --
 -- lex:add_fold_point(lexer.OPERATOR, '{', '}')
@@ -769,7 +759,7 @@ local M = {}
 -- #### Acknowledgements
 --
 -- Thanks to Peter Odding for his [lexer post][] on the Lua mailing list
--- that inspired me, and thanks to Roberto Ierusalimschy for LPeg.
+-- that provided inspiration, and thanks to Roberto Ierusalimschy for LPeg.
 --
 -- [lexer post]: http://lua-users.org/lists/lua-l/2007-04/msg00116.html
 -- @field DEFAULT (string)
@@ -906,6 +896,9 @@ local M = {}
 -- A pattern that matches either a decimal, hexadecimal, or octal number.
 -- @field float (pattern)
 -- A pattern that matches a floating point number.
+-- @field number (pattern)
+-- A pattern that matches a typical number, either a floating point, decimal,
+-- hexadecimal, or octal number.
 -- @field word (pattern)
 -- A pattern that matches a typical word. Words begin with a letter or
 -- underscore and consist of alphanumeric and underscore characters.
@@ -965,7 +958,8 @@ local function searchpath(name, path)
   local tried = {}
   for part in path:gmatch('[^;]+') do
     local filename = part:gsub('%?', name)
-    if loadfile(filename) then return filename end
+    local ok, errmsg = loadfile(filename)
+    if ok or not errmsg:find('cannot open') then return filename end
     tried[#tried + 1] = string.format("no file '%s'", filename)
   end
   return nil, table.concat(tried, '\n')
@@ -1605,6 +1599,7 @@ M.float = lpeg_S('+-')^-1 * (
  (M.digit^0 * '.' * M.digit^1 + M.digit^1 * '.' * M.digit^0 * -lpeg_P('.')) *
  (lpeg_S('eE') * lpeg_S('+-')^-1 * M.digit^1)^-1 +
  (M.digit^1 * lpeg_S('eE') * lpeg_S('+-')^-1 * M.digit^1))
+M.number = M.float + M.integer
 M.word = (M.alpha + '_') * (M.alnum + '_')^0
@@ -1625,6 +1620,69 @@ function M.token(name, patt)
 end
 ---
+-- Creates and returns a pattern that matches from string or pattern *prefix*
+-- until the end of the line.
+-- *escape* indicates whether the end of the line can be escaped with a '\'
+-- character.
+-- @param prefix String or pattern prefix to start matching at.
+-- @param escape Optional flag indicating whether or not newlines can be escaped
+-- by a '\' character. The default value is `false`.
+-- @return pattern
+-- @usage local line_comment = lexer.to_eol('//')
+-- @usage local line_comment = lexer.to_eol(P('#') + ';')
+-- @name to_eol
+function M.to_eol(prefix, escape)
+  return prefix * (not escape and M.nonnewline or M.nonnewline_esc)^0
+end
+
+---
+-- Creates and returns a pattern that matches a range of text bounded by strings
+-- or patterns *s* and *e*.
+-- This is a convenience function for matching more complicated ranges like
+-- strings with escape characters, balanced parentheses, and block comments
+-- (nested or not). *e* is optional and defaults to *s*. *single_line* indicates
+-- whether or not the range must be on a single line; *escapes* indicates
+-- whether or not to allow '\' as an escape character; and *balanced* indicates
+-- whether or not to handle balanced ranges like parentheses, and requires *s*
+-- and *e* to be different.
+-- @param s String or pattern start of a range.
+-- @param e Optional string or pattern end of a range. The default value is *s*.
+-- @param single_line Optional flag indicating whether or not the range must be
+-- on a single line.
+-- @param escapes Optional flag indicating whether or not the range end may
+-- be escaped by a '\' character.
+-- The default value is `false` unless *s* and *e* are identical,
+-- single-character strings. In that case, the default value is `true`.
+-- @param balanced Optional flag indicating whether or not to match a balanced
+-- range, like the "%b" Lua pattern. This flag only applies if *s* and *e* are
+-- different.
+-- @return pattern
+-- @usage local dq_str_escapes = lexer.range('"')
+-- @usage local dq_str_noescapes = lexer.range('"', false, false)
+-- @usage local unbalanced_parens = lexer.range('(', ')')
+-- @usage local balanced_parens = lexer.range('(', ')', false, false, true)
+-- @name range
+function M.range(s, e, single_line, escapes, balanced)
+  if type(e) ~= 'string' and type(e) ~= 'userdata' then
+    e, single_line, escapes, balanced = s, e, single_line, escapes
+  end
+  local any = M.any - e
+  if single_line then any = any - '\n' end
+  if balanced then any = any - s end
+  if escapes == nil then
+    -- Only allow escapes by default for ranges with identical, single-character
+    -- string delimiters.
+    escapes = type(s) == 'string' and #s == 1 and s == e
+  end
+  if escapes then any = any - '\\' + '\\' * M.any end
+  if balanced and s ~= e then
+    return lpeg_P{s * (any + lpeg_V(1))^0 * lpeg_P(e)^-1}
+  else
+    return s * any^0 * lpeg_P(e)^-1
+  end
+end
+
+-- Deprecated function. Use `lexer.range()` instead.
 -- Creates and returns a pattern that matches a range of text bounded by
 -- *chars* characters.
 -- This is a convenience function for matching more complicated delimited ranges
@@ -1647,9 +1705,10 @@ end
 -- @usage local unbalanced_parens = lexer.delimited_range('()')
 -- @usage local balanced_parens = lexer.delimited_range('()', false, false,
 -- true)
--- @see nested_pair
+-- @see range
 -- @name delimited_range
 function M.delimited_range(chars, single_line, no_escape, balanced)
+  print("lexer.delimited_range() is deprecated, use lexer.range()")
   local s = chars:sub(1, 1)
   local e = #chars == 2 and chars:sub(2, 2) or s
   local range
@@ -1692,7 +1751,7 @@ end
 -- @param s String character set like one passed to `lpeg.S()`.
 -- @return pattern
 -- @usage local regex = lexer.last_char_includes('+-*!%^&|=,([{') *
--- lexer.delimited_range('/')
+-- lexer.range('/')
 -- @name last_char_includes
 function M.last_char_includes(s)
   s = string.format('[%s]', s:gsub('[-%%%[]', '%%%1'))
   return lpeg_P(function(input, index)
   end)
 end
----
+-- Deprecated function. Use `lexer.range()` instead.
 -- Returns a pattern that matches a balanced range of text that starts with
 -- string *start_chars* and ends with string *end_chars*.
 -- With single-character delimiters, this function is identical to
 -- `delimited_range(start_chars .. end_chars, single_line, no_escape, balanced)`.
 -- @param start_chars The string starting a nested sequence.
 -- @param end_chars The string ending a nested sequence.
 -- @return pattern
 -- @usage local nested_comment = lexer.nested_pair('/*', '*/')
--- @see delimited_range
+-- @see range
 -- @name nested_pair
 function M.nested_pair(start_chars, end_chars)
+  print("lexer.nested_pair() is deprecated, use lexer.range()")
   local s, e = start_chars, lpeg_P(end_chars)^-1
   return lpeg_P{s * (M.any - s - end_chars + lpeg_V(1))^0 * e}
 end
```
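
A usage note, again as an illustrative sketch rather than part of the patch: because *e* is optional in `lexer.range()`, the function shifts its arguments when the second value is not a string or pattern, so common ranges can be written as follows.

```lua
local lexer = require('lexer')

-- Single-quoted string: e defaults to s, and escapes default to true
-- because the delimiters are identical single-character strings.
local sq_str = lexer.range("'")

-- Double-quoted string with escapes explicitly disabled: e is omitted,
-- so the `false, false` arguments are read as single_line and escapes.
local dq_str_noescapes = lexer.range('"', false, false)

-- Block comment: explicit end delimiter; escapes default to false here.
local c_comment = lexer.range('/*', '*/')

-- Balanced (nested) parentheses, replacing the old lexer.nested_pair().
local balanced_parens = lexer.range('(', ')', false, false, true)
```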
