diff options
Diffstat (limited to 'lexlua/awk.lua')
-rw-r--r-- | lexlua/awk.lua | 297 |
1 files changed, 297 insertions, 0 deletions
diff --git a/lexlua/awk.lua b/lexlua/awk.lua new file mode 100644 index 000000000..a3f69fd83 --- /dev/null +++ b/lexlua/awk.lua @@ -0,0 +1,297 @@ +-- Copyright 2006-2018 Mitchell mitchell.att.foicica.com. See License.txt. +-- AWK LPeg lexer. +-- Modified by Wolfgang Seeberg 2012, 2013. + +local lexer = require('lexer') +local token, word_match = lexer.token, lexer.word_match +local P, R, S = lpeg.P, lpeg.R, lpeg.S + +local lex = lexer.new('awk') + +local LEFTBRACKET = '[' +local RIGHTBRACKET = ']' +local SLASH = '/' +local BACKSLASH = '\\' +local CARET = '^' +local CR = '\r' +local LF = '\n' +local CRLF = CR .. LF +local DQUOTE = '"' +local DELIMITER_MATCHES = {['('] = ')', ['['] = ']'} +local COMPANION = {['('] = '[', ['['] = '('} +local CC = { + alnum = 1, alpha = 1, blank = 1, cntrl = 1, digit = 1, graph = 1, lower = 1, + print = 1, punct = 1, space = 1, upper = 1, xdigit = 1 +} +local LastRegexEnd = 0 +local BackslashAtCommentEnd = 0 +local KW_BEFORE_RX = { + case = 1, ['do'] = 1, ['else'] = 1, exit = 1, print = 1, printf = 1, + ['return'] = 1 +} + +local function findKeyword(input, e) + local i = e + while i > 0 and input:find("^[%l]", i) do i = i - 1 end + local w = input:sub(i + 1, e) + if i == 0 then + return KW_BEFORE_RX[w] == 1 + elseif input:find("^[%u%d_]", i) then + return false + else + return KW_BEFORE_RX[w] == 1 + end +end + +local function isRegex(input, i) + while i >= 1 and input:find('^[ \t]', i) do i = i - 1 end + if i < 1 then return true end + if input:find("^[-!%%&(*+,:;<=>?[^{|}~\f]", i) or findKeyword(input, i) then + return true + elseif input:sub(i, i) == SLASH then + return i ~= LastRegexEnd -- deals with /xx/ / /yy/. + elseif input:find('^[]%w)."]', i) then + return false + elseif input:sub(i, i) == LF then + if i == 1 then return true end + i = i - 1 + if input:sub(i, i) == CR then + if i == 1 then return true end + i = i - 1 + end + elseif input:sub(i, i) == CR then + if i == 1 then return true end + i = i - 1 + else + return false + end + if input:sub(i, i) == BACKSLASH and i ~= BackslashAtCommentEnd then + return isRegex(input, i - 1) + else + return true + end +end + +local function eatCharacterClass(input, s, e) + local i = s + while i <= e do + if input:find('^[\r\n]', i) then + return false + elseif input:sub(i, i + 1) == ':]' then + local str = input:sub(s, i - 1) + return CC[str] == 1 and i + 1 + end + i = i + 1 + end + return false +end + +local function eatBrackets(input, i, e) + if input:sub(i, i) == CARET then i = i + 1 end + if input:sub(i, i) == RIGHTBRACKET then i = i + 1 end + while i <= e do + if input:find('^[\r\n]', i) then + return false + elseif input:sub(i, i) == RIGHTBRACKET then + return i + elseif input:sub(i, i + 1) == '[:' then + i = eatCharacterClass(input, i + 2, e) + if not i then return false end + elseif input:sub(i, i) == BACKSLASH then + i = i + 1 + if input:sub(i, i + 1) == CRLF then i = i + 1 end + end + i = i + 1 + end + return false +end + +local function eatRegex(input, i) + local e = #input + while i <= e do + if input:find('^[\r\n]', i) then + return false + elseif input:sub(i, i) == SLASH then + LastRegexEnd = i + return i + elseif input:sub(i, i) == LEFTBRACKET then + i = eatBrackets(input, i + 1, e) + if not i then return false end + elseif input:sub(i, i) == BACKSLASH then + i = i + 1 + if input:sub(i, i + 1) == CRLF then i = i + 1 end + end + i = i + 1 + end + return false +end + +local ScanRegexResult +local function scanGawkRegex(input, index) + if isRegex(input, index - 2) then + local i = eatRegex(input, index) + if not i then + ScanRegexResult = false + return false + end + local rx = input:sub(index - 1, i) + for bs in rx:gmatch("[^\\](\\+)[BSsWwy<>`']") do + -- /\S/ is special, but /\\S/ is not. + if #bs % 2 == 1 then return i + 1 end + end + ScanRegexResult = i + 1 + else + ScanRegexResult = false + end + return false +end +-- Is only called immediately after scanGawkRegex(). +local function scanRegex() + return ScanRegexResult +end + +local function scanString(input, index) + local i = index + local e = #input + while i <= e do + if input:find('^[\r\n]', i) then + return false + elseif input:sub(i, i) == DQUOTE then + return i + 1 + elseif input:sub(i, i) == BACKSLASH then + i = i + 1 + -- lexer.delimited_range() doesn't handle CRLF. + if input:sub(i, i + 1) == CRLF then i = i + 1 end + end + i = i + 1 + end + return false +end + +-- purpose: prevent isRegex() from entering a comment line that ends with a +-- backslash. +local function scanComment(input, index) + local _, i = input:find('[^\r\n]*', index) + if input:sub(i, i) == BACKSLASH then BackslashAtCommentEnd = i end + return i + 1 +end + +local function scanFieldDelimiters(input, index) + local i = index + local e = #input + local left = input:sub(i - 1, i - 1) + local count = 1 + local right = DELIMITER_MATCHES[left] + local left2 = COMPANION[left] + local count2 = 0 + local right2 = DELIMITER_MATCHES[left2] + while i <= e do + if input:find('^[#\r\n]', i) then + return false + elseif input:sub(i, i) == right then + count = count - 1 + if count == 0 then return count2 == 0 and i + 1 end + elseif input:sub(i, i) == left then + count = count + 1 + elseif input:sub(i, i) == right2 then + count2 = count2 - 1 + if count2 < 0 then return false end + elseif input:sub(i, i) == left2 then + count2 = count2 + 1 + elseif input:sub(i, i) == DQUOTE then + i = scanString(input, i + 1) + if not i then return false end + i = i - 1 + elseif input:sub(i, i) == SLASH then + if isRegex(input, i - 1) then + i = eatRegex(input, i + 1) + if not i then return false end + end + elseif input:sub(i, i) == BACKSLASH then + if input:sub(i + 1, i + 2) == CRLF then + i = i + 2 + elseif input:find('^[\r\n]', i + 1) then + i = i + 1 + end + end + i = i + 1 + end + return false +end + +-- Whitespace. +lex:add_rule('whitespace', token(lexer.WHITESPACE, lexer.space^1)) + +-- Comments. +lex:add_rule('comment', token(lexer.COMMENT, '#' * P(scanComment))) + +-- Strings. +lex:add_rule('string', token(lexer.STRING, DQUOTE * P(scanString))) + +-- No leading sign because it might be binary. +local float = ((lexer.digit^1 * ('.' * lexer.digit^0)^-1) + + ('.' * lexer.digit^1)) * + (S('eE') * S('+-')^-1 * lexer.digit^1)^-1 + +-- Fields. E.g. $1, $a, $(x), $a(x), $a[x], $"1", $$a, etc. +lex:add_rule('field', + token('field', P('$') * S('$+-')^0 * + (float + + lexer.word^0 * '(' * P(scanFieldDelimiters) + + lexer.word^1 * ('[' * P(scanFieldDelimiters))^-1 + + '"' * P(scanString) + + '/' * P(eatRegex) * '/'))) +lex:add_style('field', lexer.STYLE_LABEL) + +-- Regular expressions. +-- Slash delimited regular expressions are preceded by most operators or +-- the keywords 'print' and 'case', possibly on a preceding line. They +-- can contain unescaped slashes and brackets in brackets. Some escape +-- sequences like '\S', '\s' have special meanings with Gawk. Tokens that +-- contain them are displayed differently. +lex:add_rule('gawkRegex', token('gawkRegex', SLASH * P(scanGawkRegex))) +lex:add_style('gawkRegex', lexer.STYLE_PREPROCESSOR..',underlined') +lex:add_rule('regex', token(lexer.REGEX, SLASH * P(scanRegex))) + +-- Operators. +lex:add_rule('gawkOperator', token('gawkOperator', P("|&") + "@" + "**=" + + "**")) +lex:add_style('gawkOperator', lexer.STYLE_OPERATOR..',underlined') +lex:add_rule('operator', token(lexer.OPERATOR, S('!%&()*+,-/:;<=>?[\\]^{|}~'))) + +-- Numbers. +lex:add_rule('gawkNumber', token('gawkNumber', lexer.hex_num + lexer.oct_num)) +lex:add_style('gawkNumber', lexer.STYLE_NUMBER..',underlined') +lex:add_rule('number', token(lexer.NUMBER, float)) + +-- Keywords. +lex:add_rule('keyword', token(lexer.KEYWORD, word_match[[ + BEGIN END atan2 break close continue cos delete do else exit exp fflush for + function getline gsub if in index int length log match next nextfile print + printf rand return sin split sprintf sqrt srand sub substr system tolower + toupper while +]])) + +lex:add_rule('builtInVariable', token('builtInVariable', word_match[[ + ARGC ARGV CONVFMT ENVIRON FILENAME FNR FS NF NR OFMT OFS ORS RLENGTH RS RSTART + SUBSEP +]])) +lex:add_style('builtInVariable', lexer.STYLE_CONSTANT) + +lex:add_rule('gawkBuiltInVariable', token('gawkBuiltInVariable', word_match[[ + ARGIND BINMODE ERRNO FIELDWIDTHS FPAT FUNCTAB IGNORECASE LINT PREC PROCINFO + ROUNDMODE RT SYMTAB TEXTDOMAIN +]])) +lex:add_style('gawkBuiltInVariable', lexer.STYLE_CONSTANT..',underlined') + +-- Functions. +lex:add_rule('function', token(lexer.FUNCTION, lexer.word * #P('('))) + +-- Identifiers. +lex:add_rule('identifier', token(lexer.IDENTIFIER, lexer.word)) + +-- Fold points. +lex:add_fold_point(lexer.OPERATOR, '{', '}') +lex:add_fold_point(lexer.COMMENT, '#', lexer.fold_line_comments('#')) + +return lex |