From fad15f79b1230b3076be515d6894c8919562809b Mon Sep 17 00:00:00 2001 From: mitchell Date: Sat, 25 Apr 2020 16:26:31 -0400 Subject: Reformatted Lua LPeg lexers and added new convenience functions and pattern. `lexer.range()` replaces `lexer.delimited_range()` and `lexer.nested_pair()`. `lexer.to_eol()` replaces `patt * lexer.nonnewline^0` constructs. `lexer.number` replaces `lexer.float + lexer.integer`. Also added unit tests for lexer functions. --- doc/LPegLexer.html | 233 ++++++++++++++++++++++++++--------------------------- 1 file changed, 113 insertions(+), 120 deletions(-) (limited to 'doc/LPegLexer.html') diff --git a/doc/LPegLexer.html b/doc/LPegLexer.html index e31a091b1..3f553e9f9 100644 --- a/doc/LPegLexer.html +++ b/doc/LPegLexer.html @@ -226,6 +226,13 @@ as fold points. For example, the C line } else { would be marked as a fold point. The default is 0. + + + fold.compact + + If fold.compact is set to 1, blank lines + after an ending fold point are included in that fold. + @@ -672,7 +679,7 @@ operator 30 lexer.punct, lexer.space, lexer.newline, lexer.nonnewline, lexer.nonnewline_esc, lexer.dec_num, lexer.hex_num, lexer.oct_num, lexer.integer, - lexer.float, and lexer.word. You may use your own token names if + lexer.float, lexer.number, and lexer.word. You may use your own token names if none of the above fit your language, but an advantage to using predefined token names is that your lexer's tokens will inherit the universal syntax highlighting color theme used by your text editor.

@@ -725,9 +732,8 @@ operator 30

Line-style comments with a prefix character(s) are easy to express with LPeg:


-    local shell_comment = token(lexer.COMMENT, '#' * lexer.nonnewline^0)
-    local c_line_comment = token(lexer.COMMENT,
-                                 '//' * lexer.nonnewline_esc^0)
+    local shell_comment = token(lexer.COMMENT, lexer.to_eol('#'))
+    local c_line_comment = token(lexer.COMMENT, lexer.to_eol('//', true))
     

The comments above start with a '#' or "//" and go to the end of the line. @@ -738,8 +744,7 @@ operator 30 express:


-    local c_comment = token(lexer.COMMENT,
-                            '/*' * (lexer.any - '*/')^0 * P('*/')^-1)
+    local c_comment = token(lexer.COMMENT, lexer.range('/*', '*/'))
     

This comment starts with a "/*" sequence and contains anything up to and @@ -748,24 +753,14 @@ operator 30

Strings

-

It is tempting to think that a string is not much different from the block - comment shown above in that both have start and end delimiters:

- -

-    local dq_str = '"' * (lexer.any - '"')^0 * P('"')^-1
-    local sq_str = "'" * (lexer.any - "'")^0 * P("'")^-1
-    local simple_string = token(lexer.STRING, dq_str + sq_str)
-    
- -

However, most programming languages allow escape sequences in strings such - that a sequence like "\"" in a double-quoted string indicates that the - '"' is not the end of the string. The above token incorrectly matches - such a string. Instead, use the lexer.delimited_range() convenience - function.

+

Most programming languages allow escape sequences in strings such that a + sequence like “\"” in a double-quoted string indicates that the + ‘"’ is not the end of the string. Called with a single delimiter as shown + below, lexer.range() handles such escapes without any extra arguments.


-    local dq_str = lexer.delimited_range('"')
-    local sq_str = lexer.delimited_range("'")
+    local dq_str = lexer.range('"')
+    local sq_str = lexer.range("'")
     local string = token(lexer.STRING, dq_str + sq_str)
     
@@ -775,10 +770,10 @@ operator 30

Numbers

Most programming languages have the same format for integer and float tokens, - so it might be as simple as using a couple of predefined LPeg patterns:

+ so it might be as simple as using a predefined LPeg pattern:


-    local number = token(lexer.NUMBER, lexer.float + lexer.integer)
+    local number = token(lexer.NUMBER, lexer.number)
     

However, some languages allow postfix characters on integers.

@@ -1391,11 +1386,11 @@ operator 30 lex:add_rule('whitespace', token(lexer.WHITESPACE, lexer.space^1)) lex:add_rule('keyword', token(lexer.KEYWORD, word_match[[foo bar baz]])) lex:add_rule('custom', token('custom', P('quux'))) - lex:add_style('custom', lexer.STYLE_KEYWORD..',bold') + lex:add_style('custom', lexer.STYLE_KEYWORD .. ',bold') lex:add_rule('identifier', token(lexer.IDENTIFIER, lexer.word)) - lex:add_rule('string', token(lexer.STRING, lexer.delimited_range('"'))) - lex:add_rule('comment', token(lexer.COMMENT, '#' * lexer.nonnewline^0)) - lex:add_rule('number', token(lexer.NUMBER, lexer.float + lexer.integer)) + lex:add_rule('string', token(lexer.STRING, lexer.range('"'))) + lex:add_rule('comment', token(lexer.COMMENT, lexer.to_eol('#'))) + lex:add_rule('number', token(lexer.NUMBER, lexer.number)) lex:add_rule('operator', token(lexer.OPERATOR, S('+-*/%^=<>,.()[]{}'))) lex:add_fold_point(lexer.OPERATOR, '{', '}') @@ -1463,7 +1458,7 @@ operator 30

Acknowledgements

Thanks to Peter Odding for his lexer post on the Lua mailing list - that inspired me, and thanks to Roberto Ierusalimschy for LPeg.

+ that provided inspiration, and thanks to Roberto Ierusalimschy for LPeg.

Lua lexer module API fields

@@ -1869,6 +1864,13 @@ operator 30

A pattern that matches any single, non-newline character or any set of end of line characters escaped with '\'.

+

+ +

lexer.number (pattern)

+ +

A pattern that matches a typical number: a floating point, decimal, + hexadecimal, or octal number.

+

lexer.oct_num (pattern)

@@ -2071,58 +2073,6 @@ operator 30 -

- -

lexer.delimited_range (chars, single_line, no_escape, balanced)

- -

Creates and returns a pattern that matches a range of text bounded by - chars characters. - This is a convenience function for matching more complicated delimited ranges - like strings with escape characters and balanced parentheses. single_line - indicates whether or not the range must be on a single line, no_escape - indicates whether or not to ignore '\' as an escape character, and balanced - indicates whether or not to handle balanced ranges like parentheses and - requires chars to be composed of two characters.

- -

Fields:

- - - - -

Usage:

- - - - -

Return:

- - - - -

See also:

- - - -

lexer.embed (lexer, child, start_rule, end_rule)

@@ -2241,7 +2191,7 @@ operator 30 @@ -2344,44 +2294,6 @@ operator 30 -

- -

lexer.nested_pair (start_chars, end_chars)

- -

Returns a pattern that matches a balanced range of text that starts with - string start_chars and ends with string end_chars. - With single-character delimiters, this function is identical to - delimited_range(start_chars..end_chars, false, true, true).

- -

Fields:

- - - - -

Usage:

- - - - -

Return:

- - - - -

See also:

- - - -

lexer.new (name, opts)

@@ -2420,6 +2332,54 @@ operator 30 +

+ +

lexer.range(s, e, single_line, escapes, balanced)

+ +

Creates and returns a pattern that matches a range of text bounded by strings + or patterns s and e. + This is a convenience function for matching more complicated ranges like + strings with escape characters, balanced parentheses, and block comments + (nested or not). e is optional and defaults to s. single_line indicates + whether or not the range must be on a single line; escapes indicates + whether or not to allow ‘\’ as an escape character; and balanced indicates + whether or not to handle balanced ranges like parentheses, and requires s + and e to be different.

+ +

Parameters:

+ + + + +

Usage:

+ + + + +

Return:

+ + + +

lexer.starts_line (patt)

@@ -2449,6 +2409,39 @@ operator 30 +

+ +

lexer.to_eol(prefix, escape)

+ +

Creates and returns a pattern that matches from string or pattern prefix + until the end of the line. + escape indicates whether the end of the line can be escaped with a ‘\’ + character.

+ +

Parameters:

+ + + + +

Usage:

+ + + + +

Return:

+ + + +

lexer.token (name, patt)

-- cgit v1.2.3