Diffstat (limited to 'mods/src/em_scan.erl')
-rw-r--r--  mods/src/em_scan.erl  381
1 file changed, 381 insertions, 0 deletions
diff --git a/mods/src/em_scan.erl b/mods/src/em_scan.erl
new file mode 100644
index 0000000..5467423
--- /dev/null
+++ b/mods/src/em_scan.erl
@@ -0,0 +1,381 @@
+%%%----------------------------------------------------------------------
+%%% File    : em_scan.erl
+%%% Author  : Luke Gorrie <luke@bluetail.com>
+%%% Purpose : Scanner for leex-style DFAs.
+%%% Created : 1 May 2001 by Luke Gorrie <luke@bluetail.com>
+%%%----------------------------------------------------------------------
+
+-module(em_scan).
+-author('luke@bluetail.com').
+
+-record(scanner, {initial_state, dfa_fn, action_fn}).
+
+%% Token record. The actual lexeme, if returned by the scanner, is
+%% discarded - all that's taken from the user-written "actions" is the
+%% token type. Instead we just keep the start/finish positions in a
+%% cord.
+%%
+%% Line/column information is not retained; it should be cheap to
+%% derive later from the positions.
+-record(token,
+        {
+          %% type chosen by scanner. There is also a special type,
+          %% 'em_skipped', used when the scanner returns 'skip_token'
+          %% but we record the token anyway.
+          type,
+          %% start position in cord
+          start,
+          %% finish position in cord (point *after* the last character)
+          finish,
+          %% number of characters beyond the end of the lexeme that
+          %% the scanner state machine examined, recorded to track
+          %% dependencies
+          lookahead,
+          %% book-keeping flag saying whether the token is known to
+          %% need re-scanning
+          dirty=false
+        }).
+
+-compile(export_all).
+%%-export([Function/Arity, ...]).
+
+%% Bogus line number to pass to leex/yecc, because I don't want to
+%% track lines at this low level. It's not safe to use an atom, but
+%% hopefully -42 looks deliberate enough to prompt a grep :-)
+-define(line, -42). %% Confusing number appearing where line # should be :-)
+
+make_scanner(State0, DFA, Action) ->
+    #scanner{initial_state=State0,
+             dfa_fn=DFA,
+             action_fn=Action}.
+
+make_scheme_scanner() -> make_leex_scanner(em_scheme_scan).
+make_erlang_scanner() -> make_leex_scanner(em_erlang_scan).
+make_test_scanner()   -> make_leex_scanner(em_test_scan).
+
+make_leex_scanner(Mod) ->
+    make_scanner(Mod:yystate(),
+                 {Mod, yystate},
+                 {Mod, yyaction}).
+
+scan_annotation(Scanner, _S0, Cord, _RText, _Start, _End) ->
+    Walker = cord:walker(Cord),
+    case edit_scan(Scanner, Walker) of
+        {ok, Toks} ->
+            {ok, Toks};
+        {error, _Reason} ->
+            {ok, bad_scan}
+    end.
+
+%% The *_test functions scan a string and return just the token types.
+test_test(Str) ->
+    case test_string(Str) of
+        {ok, Toks} ->
+            {ok, lists:map(fun(T) -> T#token.type end, Toks)};
+        X ->
+            X
+    end.
+
+erlang_test(Str) ->
+    case erlang_string(Str) of
+        {ok, Toks} ->
+            {ok, lists:map(fun(T) -> T#token.type end, Toks)};
+        X ->
+            X
+    end.
+
+scheme_test(Str) ->
+    case scheme_string(Str) of
+        {ok, Toks} ->
+            {ok, lists:map(fun(T) -> T#token.type end, Toks)};
+        X ->
+            X
+    end.
+
+test_string(Str) ->
+    Cord = list_to_binary(Str),
+    Scan = make_test_scanner(),
+    edit_scan(Scan, cord:walker(Cord)).
+
+scheme_string(Str) ->
+    Cord = list_to_binary(Str),
+    Scan = make_scheme_scanner(),
+    edit_scan(Scan, cord:walker(Cord)).
+
+erlang_string(Str) ->
+    Cord = list_to_binary(Str),
+    Scan = make_erlang_scanner(),
+    edit_scan(Scan, cord:walker(Cord)).
+
+%% Returns: {ok, [#token]} | {error, Reason}
+edit_scan(Scn, Walker) ->
+    edit_scan(Scn, Walker, 1, []).
+
+edit_scan(Scn, Walker, Pos) ->
+    edit_scan(Scn, Walker, Pos, []).
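+
+%% Hedged usage sketch, not part of the original patch: what
+%% edit_scan/2 hands back, assuming the em_test_scan leex module is
+%% compiled and on the code path. start/finish index into the cord,
+%% with finish pointing one past the last character.
+position_demo() ->
+    {ok, [First|_]} = test_string("-bar+ +foo-"),
+    %% A lexeme spans [start, finish), so its length is finish - start.
+    Len = First#token.finish - First#token.start,
+    true = Len >= 1,
+    {First#token.type, Len}.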
+
+edit_scan(Scn, W0, Pos, Acc) ->
+    case token(Scn, W0) of
+        {done, eof} ->
+            {ok, lists:reverse(Acc)};
+        {done, Result, W1} ->
+            Token = make_token(Pos, Result),
+            edit_scan(Scn, W1, Token#token.finish, [Token|Acc]);
+        {error, Reason} ->
+            {error, Reason}
+    end.
+
+make_skipped_token(Pos, Len, LookAhead) ->
+    #token{type=em_skipped,
+           start=Pos,
+           finish=Pos+Len,
+           lookahead=LookAhead}.
+
+% tokens(Scn, Walker) ->
+%     tokens(Scn, Walker, []).
+
+% tokens(Scn, W0, Acc) ->
+%     case token(Scn, W0) of
+%         {done, {ok, T, Acs}, W1} ->
+%             tokens(Scn, W1, [T|Acc]);
+%         {done, {skip_token, Acs}, W1} ->
+%             tokens(Scn, W1, Acc);
+%         {done, eof} ->
+%             {ok, lists:reverse(Acc)};
+%         {error, Reason} ->
+%             {error, Reason}
+%     end.
+
+token(Scn, Walker) ->
+    State0 = Scn#scanner.initial_state,
+    token(Scn, Walker, State0, [], 0, reject, 0).
+
+token(Scn, W0, S0, Tcs0, Tlen0, A0, Alen0) ->
+    ActionF = Scn#scanner.action_fn,
+    DFA_F = Scn#scanner.dfa_fn,
+    {Ics, Tcs1, W1} = case cord:walker_next(W0) of
+                          {done, WEOF} ->
+                              {eof, Tcs0, WEOF};
+                          {Ch, Wnext} ->
+                              {[Ch], [Ch|Tcs0], Wnext}
+                      end,
+    case DFA_F(S0, Ics, ?line, Tlen0, A0, Alen0) of
+        {A1,Alen1,[],_L1} ->                    % accepting end state
+            TcsFwd = lists:reverse(Tcs1),
+            token_cont(Scn,TcsFwd,W1,[],ActionF(A1,Alen1,TcsFwd,?line));
+        {A1,Alen1,[],_L1,S1} ->                 % after accepting state
+            token(Scn,W1,S1,Tcs1,Alen1,A1,Alen1);
+        {A1,Alen1,Ics1,_L1,_S1} ->              % accepting state with leftover
+                                                % chars.. sounds like a time
+                                                % to act.
+            TcsFwd = lists:reverse(Tcs1),
+            Acs = yypre(TcsFwd, Alen1),
+            token_cont(Scn,Acs,W1,Ics1,ActionF(A1,Alen1,TcsFwd,?line));
+        {A1,Alen1,Tlen1,[],_L1,S1} ->           % after a non-accepting state
+            token(Scn,W1,S1,Tcs1,Tlen1,A1,Alen1);
+        {reject,_Alen1,_Tlen1,eof,_L1,_S1} ->
+            {done,eof};
+        {reject,_Alen1,Tlen1,_Ics1,_L1,_S1} ->
+            %% Return shape matches edit_scan/4's {error, Reason} clause.
+            {error, {illegal, yypre(lists:reverse(Tcs1), Tlen1+1)}};
+        {A1,Alen1,_Tlen1,_Ics1,_L1,_S1} ->
+            TcsFwd = lists:reverse(Tcs1),
+            {Acs, Rest} = yysplit(TcsFwd, Alen1),
+            token_cont(Scn,Acs,W1,Rest,ActionF(A1,Alen1,TcsFwd,?line))
+    end.
+
+token_cont(_Scn, Acs, W, Rest, {token, T}) ->
+    {done, {ok, T, Acs, length_of(Rest)}, pushback(W, Rest)};
+token_cont(_Scn, Acs, W, Rest, {end_token, T}) ->
+    {done, {ok, T, Acs, length_of(Rest)}, pushback(W, Rest)};
+%% The error clause must come before the catch-all below, or {error, S}
+%% results from actions would be swallowed as skipped tokens.
+token_cont(_Scn, _Acs, _W, _Rest, {error, S}) ->
+    {error, {user, S}};
+token_cont(_Scn, Acs, W, Rest, _SkipToken) ->
+    {done, {skip_token, Acs, length_of(Rest)}, pushback(W, Rest)}.
+
+adjust_pos(_C, L, [$\n|T]) -> adjust_pos(0, L+1, T);
+adjust_pos(C, L, [_|T])    -> adjust_pos(C+1, L, T);
+adjust_pos(C, L, [])       -> {C, L}.
+
+length_of(eof)  -> 0;
+length_of(List) -> length(List).
+
+pushback(W, eof) ->
+    W;
+pushback(W, Chars) ->
+    lists:foldr(fun(C, Wn) -> cord:walker_push(C, Wn) end,
+                W,
+                Chars).
+
+yyrev([H|T], Acc) -> yyrev(T, [H|Acc]);
+yyrev([], Acc) -> Acc.
+
+yypre([H|T], N) when N > 0 -> [H|yypre(T, N-1)];
+yypre(_L, _N) -> [].
+
+yysuf([_H|T], N) when N > 0 -> yysuf(T, N-1);
+yysuf(L, 0) -> L.
+
+yysplit(L, N) -> yysplit(L, N, []).
+yysplit([H|T], N, Acc) when N > 0 -> yysplit(T, N-1, [H|Acc]);
+yysplit(L, 0, Acc) -> {lists:reverse(Acc), L}.
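+
+%% Hedged sketch, not part of the original patch: the walker protocol
+%% that token/7 and pushback/2 rely on, using the cord calls exactly
+%% as they appear above. Reading past the end yields {done, Walker};
+%% pushed-back characters are re-read in order.
+walker_demo() ->
+    W0 = cord:walker(list_to_binary("ab")),
+    {$a, W1} = cord:walker_next(W0),    % first character
+    {$b, W2} = cord:walker_next(W1),    % second character
+    W3 = cord:walker_push($b, W2),      % push it back...
+    {$b, W4} = cord:walker_next(W3),    % ...and read it again
+    {done, _} = cord:walker_next(W4),   % end of input
+    ok.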
+
+%%%%%
+%% Experimentation with incremental parsing (working)
+
+str0() -> "-bar+ +foo-".
+toks0() ->
+    {ok, Toks} = test_string(str0()),
+    Toks.
+
+%% Hand-hacked version of toks0() with the space character deleted and
+%% its token marked as dirty. When properly re-scanned, this should be
+%% the same as toks1() (the basis of the test case).
+dirty_toks() ->
+    [OK_A,OK_B,OK_C,Changed0|Following0] = toks0(),
+    Changed1 = Changed0#token{dirty=true},
+    Following1 = lists:map(fun(T) ->
+                                   Start = T#token.start,
+                                   Finish = T#token.finish,
+                                   T#token{start=Start-1,
+                                           finish=Finish-1}
+                           end,
+                           Following0),
+    [OK_A,OK_B,OK_C,Changed1|Following1].
+
+%% str0() with the space deleted: "-bar++foo-". The space's token is
+%% the one marked dirty in dirty_toks().
+str1() -> str0() -- " ".
+toks1() ->
+    {ok, Toks} = test_string(str1()),
+    Toks.
+
+rescan_test() ->
+    Scn = make_test_scanner(),
+    Dirty = mark_dirty(dirty_toks()),
+    Cord = cord:new(str1()),
+    Result = rescan(Scn, Cord, Dirty),
+    {Result == toks1(), Result, Dirty}.
+
+%% What should happen with this simple algorithm and test case:
+%% 1. We see that a token is dirty.
+%% 2. We scan backwards for dependencies (any token whose look-ahead
+%%    reaches the dirty token, or reaches another token that does,
+%%    and so on).
+%% 3. From the earliest dependency, we start re-scanning everything
+%%    until we scan a token which leaves the state of the lexer the
+%%    same as it was previously (i.e. so the following token is known
+%%    to be unchanged).
+%%
+%% So, how is the state of the lexer defined? In leex it seems to me
+%% that the internal scanner state is always the same at the start of
+%% each token (barring anything magical done in actions - which for
+%% now I ignore). So, I think it's safe to assume that a token will be
+%% unchanged if both the chars in its lexeme and the ones reached by
+%% its look-ahead are the same. For "tricky stuff" it may be necessary
+%% to capture the process dictionary and include (some of) it in the
+%% state.
+%%
+%% So I will terminate when I reach a token beyond the changed text
+%% which starts in the same place.
+%%
+%% NB: We shouldn't need to go *too* far backwards when marking
+%% dependencies in languages I can think of, because of common
+%% zero-lookahead tokens like \n , ; ( ) ...etc, and most other
+%% tokens will just be 1-lookahead. A sketch of the dependency rule
+%% (depends_demo/0) follows rescan_dirty/4 below.
+
+re_lex(Toks0) ->
+    mark_dirty(Toks0).
+
+mark_dirty(Toks) ->
+    mark_dirty(Toks, []).
+
+mark_dirty([H|T], Acc) when H#token.dirty == true ->
+    mark_dirty1([H|T], H#token.start, Acc);
+mark_dirty([H|T], Acc) ->
+    mark_dirty(T, [H|Acc]);
+mark_dirty([], Acc) ->
+    lists:reverse(Acc).
+
+%% Toks0 holds the tokens before the dirty one, nearest first; mark
+%% every one whose look-ahead reaches the dirty position.
+mark_dirty1(OK, DirtyPos, Toks0) ->
+    F = fun(Tok, DP) ->
+                case (Tok#token.finish-1) + Tok#token.lookahead of
+                    P when P >= DP ->
+                        {Tok#token{dirty=true},
+                         Tok#token.start};
+                    _ ->
+                        {Tok, DP}
+                end
+        end,
+    {Toks1, _} = lists:mapfoldl(F, DirtyPos, Toks0),
+    lists:reverse(Toks1) ++ OK.
+
+%% Rescan dirty tokens.
+rescan(_Scn, _Cord, []) ->
+    [];
+rescan(Scn, Cord, [H|T]) when H#token.dirty == false ->
+    [H|rescan(Scn, Cord, T)];
+rescan(Scn, Cord, [H|T]) when H#token.dirty == true ->
+    Pos = H#token.start,
+    {_, Region} = cord:split(Cord, Pos-1),
+    Walker = cord:walker(Region),
+    rescan_dirty(Scn, Walker, Pos, [H|T]).
+
+%% rescan_dirty(Scn, Walker, Pos, Toks)
+%%
+%% The first token is dirty. Scan until we get back to a sane state.
+rescan_dirty(Scn, W0, Pos, [Tok|Toks]) ->
+    io:format("(Pos = ~p) Rescanning ~p~n", [Pos, Tok]),
+    case token(Scn, W0) of
+        {done, eof} ->
+            [];
+        {done, Result, W1} ->
+            Token = make_token(Pos, Result),
+            [Token|rescan_dirty_cont(Scn, W1, Token#token.finish, Toks)];
+        {error, Reason} ->
+            %% FIXME: should make an error-token and carry on
+            {error, Reason}
+    end.
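+
+%% Hedged sketch, not part of the original patch, of the dependency
+%% rule used by mark_dirty1/3: a token depends on an edit at DirtyPos
+%% when (finish - 1) + lookahead reaches it. The positions here are
+%% made up for illustration.
+depends_demo() ->
+    DirtyPos = 10,
+    T1 = #token{type=a, start=5, finish=8, lookahead=1},  % reaches 8
+    T2 = #token{type=b, start=8, finish=10, lookahead=1}, % reaches 10
+    false = (T1#token.finish - 1) + T1#token.lookahead >= DirtyPos,
+    true  = (T2#token.finish - 1) + T2#token.lookahead >= DirtyPos,
+    ok.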
+
+rescan_dirty_cont(_Scn, _W, _Pos, []) ->
+    [];
+rescan_dirty_cont(Scn, W, Pos, Rest) ->
+    Next = hd(Rest),
+    if
+        %% This token no longer exists!
+        Next#token.finish =< Pos ->
+            io:format("(Pos = ~p) Discarding token: ~p~n", [Pos, Next]),
+            rescan_dirty_cont(Scn, W, Pos, tl(Rest));
+        %% We need to carry on if the token is known to be dirty, or
+        %% if we aren't at the same place that it was scanned from
+        %% before
+        Next#token.dirty == true;
+        Next#token.start /= Pos ->
+            rescan_dirty(Scn, W, Pos, Rest);
+        true ->
+            Rest
+    end.
+
+make_token(Pos, {ok, T, Acs, LookAhead}) ->
+    Type = element(1, T),
+    Len = length(Acs),
+    #token{type=Type,
+           start=Pos,
+           finish=Pos+Len,
+           lookahead=LookAhead};
+make_token(Pos, {skip_token, Acs, LookAhead}) ->
+    Len = length(Acs),
+    #token{type=em_skipped,
+           start=Pos,
+           finish=Pos+Len,
+           lookahead=LookAhead}.
+
+make_error_token(Pos) ->
+    #token{type=em_error,
+           start=Pos,
+           finish=Pos+1,
+           lookahead=1}.
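+
+%% Hedged end-to-end sketch, not part of the original patch, mirroring
+%% rescan_test/0: propagate dirtiness backwards through look-ahead
+%% dependencies, re-scan incrementally, and compare against a full
+%% scan of the edited string. Assumes em_test_scan is available.
+incremental_demo() ->
+    Dirty = mark_dirty(dirty_toks()),
+    NDirty = length([T || T <- Dirty, T#token.dirty]),
+    {Same, _Rescanned, _Marked} = rescan_test(),
+    {Same, NDirty}.	% Same == true if the incremental scan agrees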