Diffstat (limited to 'mods/src/em_scan.erl')
-rw-r--r--  mods/src/em_scan.erl  381
1 file changed, 381 insertions, 0 deletions
diff --git a/mods/src/em_scan.erl b/mods/src/em_scan.erl
new file mode 100644
index 0000000..5467423
--- /dev/null
+++ b/mods/src/em_scan.erl
@@ -0,0 +1,381 @@
+%%%----------------------------------------------------------------------
+%%% File : em_scan.erl
+%%% Author : Luke Gorrie <luke@bluetail.com>
+%%% Purpose : Scanner for leex-style DFAs.
+%%% Created : 1 May 2001 by Luke Gorrie <luke@bluetail.com>
+%%%----------------------------------------------------------------------
+
+-module(em_scan).
+-author('luke@bluetail.com').
+
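+%% A scanner is parameterised over a leex-style DFA. Expected shapes
+%% (an assumption, inferred from how make_leex_scanner/1 below plugs
+%% in leex-generated modules):
+%%   dfa_fn    : fun(State, InChars, Line, TokLen, Action, ActionLen)
+%%   action_fn : fun(Action, TokLen, TokChars, Line)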
+-record(scanner, {initial_state, dfa_fn, action_fn}).
+
+%% Token record. The actual lexeme, if returned by the scanner, is
+%% discarded - all that's taken from the user-written "actions" is the
+%% token type. Instead we just keep the start/finish positions in a
+%% cord.
+%%
+%% We don't retain line/column information; hopefully that's easy to
+%% derive later (see adjust_pos/3 below).
+-record(token,
+        {
+         %% type chosen by the scanner. There is also a special type,
+         %% 'em_skipped', used when the scanner returns 'skip_token'
+         %% but we record the token anyway.
+         type,
+         %% start position in cord
+         start,
+         %% finish position in cord (the point *after* the last character)
+         finish,
+         %% number of characters beyond the end of the lexeme that the
+         %% scanner's state machine examined, recorded to track
+         %% dependencies
+         lookahead,
+         %% bookkeeping flag saying whether the token is known to need
+         %% re-scanning
+         dirty=false
+        }).
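+
+%% For example (hypothetical values): the lexeme "foo" starting at
+%% cord position 2 would be recorded as
+%%   #token{type=atom, start=2, finish=5, lookahead=1}
+%% if the DFA examined one character past the lexeme before accepting;
+%% 'atom' is a made-up token type here.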
+
+-compile(export_all).
+%%-export([Function/Arity, ...]).
+
+%% Bogus line number to pass to leex/yecc, because I don't want to
+%% track lines at this low level. It's not safe to use an atom, but
+%% hopefully -42 looks deliberate enough to prompt a grep :-)
+
+-define(line, -42). %% Confusing number appearing where line # should be :-)
+
+make_scanner(State0, DFA, Action) ->
+ #scanner{initial_state=State0,
+ dfa_fn=DFA,
+ action_fn=Action}.
+
+make_scheme_scanner() -> make_leex_scanner(em_scheme_scan).
+make_erlang_scanner() -> make_leex_scanner(em_erlang_scan).
+make_test_scanner() -> make_leex_scanner(em_test_scan).
+
+make_leex_scanner(Mod) ->
+    make_scanner(Mod:yystate(),
+                 fun Mod:yystate/6,
+                 fun Mod:yyaction/4).
+
+scan_annotation(Scanner, _S0, Cord, _RText, _Start, _End) ->
+    Walker = cord:walker(Cord),
+    case edit_scan(Scanner, Walker) of
+        {ok, Toks} ->
+            {ok, Toks};
+        {error, _Reason} ->
+            {ok, bad_scan}
+    end.
+
+test_test(Str) ->
+    case test_string(Str) of
+        {ok, Toks} ->
+            {ok, lists:map(fun(T) -> T#token.type end, Toks)};
+        X ->
+            X
+    end.
+
+erlang_test(Str) ->
+    case erlang_string(Str) of
+        {ok, Toks} ->
+            {ok, lists:map(fun(T) -> T#token.type end, Toks)};
+        X ->
+            X
+    end.
+
+scheme_test(Str) ->
+    case scheme_string(Str) of
+        {ok, Toks} ->
+            {ok, lists:map(fun(T) -> T#token.type end, Toks)};
+        X ->
+            X
+    end.
+
+test_string(Str) ->
+ Cord = list_to_binary(Str),
+ Scan = make_test_scanner(),
+ edit_scan(Scan, cord:walker(Cord)).
+
+scheme_string(Str) ->
+ Cord = list_to_binary(Str),
+ Scan = make_scheme_scanner(),
+ edit_scan(Scan, cord:walker(Cord)).
+
+erlang_string(Str) ->
+ Cord = list_to_binary(Str),
+ Scan = make_erlang_scanner(),
+ edit_scan(Scan, cord:walker(Cord)).
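+
+%% Sketch of shell usage (hypothetical output; the actual token types
+%% depend on the rules in em_test_scan, which lives elsewhere):
+%%
+%%   1> em_scan:test_test("-bar+ +foo-").
+%%   {ok,['-',atom,'+',em_skipped,'+',atom,'-']}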
+
+%% Returns: {ok, [#token]} | {error, Reason}
+edit_scan(Scn, Walker) ->
+    edit_scan(Scn, Walker, 1, []).
+
+edit_scan(Scn, Walker, Pos) ->
+    edit_scan(Scn, Walker, Pos, []).
+
+edit_scan(Scn, W0, Pos, Acc) ->
+    case token(Scn, W0) of
+        {done, eof} ->
+            {ok, lists:reverse(Acc)};
+        {done, {error, Reason}, _W1} ->
+            {error, Reason};
+        {done, Result, W1} ->
+            Token = make_token(Pos, Result),
+            edit_scan(Scn, W1, Token#token.finish, [Token|Acc])
+    end.
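+
+%% Note that the tokens tile the cord: each token's start is the
+%% previous token's finish, because edit_scan/4 threads
+%% Token#token.finish through as the next Pos.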
+
+make_skipped_token(Pos, Len, LookAhead) ->
+ #token{type=em_skipped,
+ start=Pos,
+ finish=Pos+Len,
+ lookahead=LookAhead}.
+
+% tokens(Scn, Walker) ->
+% tokens(Scn, Walker, []).
+
+% tokens(Scn, W0, Acc) ->
+% case token(Scn, W0) of
+% {done, {ok, T, Acs}, W1} ->
+% tokens(Scn, W1, [T|Acc]);
+% {done, {skip_token, Acs}, W1} ->
+% tokens(Scn, W1, Acc);
+% {done, eof} ->
+% {ok, lists:reverse(Acc)};
+% {error, Reason} ->
+% {error, Reason}
+% end.
+
+token(Scn, Walker) ->
+ State0 = Scn#scanner.initial_state,
+ token(Scn, Walker, State0, [], 0, reject, 0).
+
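+%% The case clauses below follow the tuple shapes returned by a
+%% leex-generated yystate/6:
+%%   {Action, AcceptLen, RestChars, Line}                  accepting end state
+%%   {Action, AcceptLen, RestChars, Line, State}           accepting transition
+%%   {Action, AcceptLen, TokLen, RestChars, Line, State}   non-accepting transition
+%%   {reject, AcceptLen, TokLen, RestChars, Line, State}   match failure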
+token(Scn, W0, S0, Tcs0, Tlen0, A0, Alen0) ->
+    ActionF = Scn#scanner.action_fn,
+    DFA_F = Scn#scanner.dfa_fn,
+    {Ics, Tcs1, W1} = case cord:walker_next(W0) of
+                          {done, WEOF} ->
+                              {eof, Tcs0, WEOF};
+                          {Ch, Wnext} ->
+                              {[Ch], [Ch|Tcs0], Wnext}
+                      end,
+    case DFA_F(S0, Ics, ?line, Tlen0, A0, Alen0) of
+        {A1,Alen1,[],_L1} ->                    % accepting end state
+            TcsFwd = lists:reverse(Tcs1),
+            token_cont(Scn,TcsFwd,W1,[],ActionF(A1,Alen1,TcsFwd,?line));
+        {A1,Alen1,[],_L1,S1} ->                 % after accepting state
+            token(Scn,W1,S1,Tcs1,Alen1,A1,Alen1);
+        {A1,Alen1,Ics1,_L1,_S1} ->              % accepting state with leftover
+                                                % chars, so it's time to act
+            TcsFwd = lists:reverse(Tcs1),
+            Acs = yypre(TcsFwd, Alen1),
+            token_cont(Scn,Acs,W1,Ics1,ActionF(A1,Alen1,TcsFwd,?line));
+        {A1,Alen1,Tlen1,[],_L1,S1} ->           % after a non-accepting state
+            token(Scn,W1,S1,Tcs1,Tlen1,A1,Alen1);
+        {reject,_Alen1,_Tlen1,eof,_L1,_S1} ->
+            {done,eof};
+        {reject,_Alen1,Tlen1,Ics1,_L1,_S1} ->
+            {done,
+             {error,{illegal,yypre(Tcs1, Tlen1+1)}},
+             Ics1};
+        {A1,Alen1,_Tlen1,_Ics1,_L1,_S1} ->
+            TcsFwd = lists:reverse(Tcs1),
+            {Acs, Rest} = yysplit(TcsFwd, Alen1),
+            token_cont(Scn,Acs,W1,Rest,ActionF(A1,Alen1,TcsFwd,?line))
+    end.
+
+token_cont(_Scn, Acs, W, Rest, {token, T}) ->
+    {done, {ok, T, Acs, length_of(Rest)}, pushback(W, Rest)};
+token_cont(_Scn, Acs, W, Rest, {end_token, T}) ->
+    {done, {ok, T, Acs, length_of(Rest)}, pushback(W, Rest)};
+token_cont(_Scn, Acs, W, Rest, skip_token) ->
+    {done, {skip_token, Acs, length_of(Rest)}, pushback(W, Rest)};
+token_cont(_Scn, _Acs, W, Rest, {error, S}) ->
+    {done, {error, {user,S}}, pushback(W, Rest)}.
+
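+%% Derive {Column, Line} from the characters preceding a position,
+%% e.g. adjust_pos(0, 1, "ab\ncd") yields {2, 2}.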
+adjust_pos(_C, L, [$\n|T]) -> adjust_pos(0, L+1, T);
+adjust_pos(C, L, [_|T]) -> adjust_pos(C+1, L, T);
+adjust_pos(C, L, []) -> {C, L}.
+
+length_of(eof) -> 0;
+length_of(List) -> length(List).
+
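+%% Push unconsumed characters back onto the walker. foldr pushes the
+%% last char first, so the head of Chars ends up on top and is the
+%% next character the walker yields.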
+pushback(W, eof) ->
+ W;
+pushback(W, Chars) ->
+ lists:foldr(fun(C, Wn) -> cord:walker_push(C, Wn) end,
+ W,
+ Chars).
+
+yyrev([H|T], Acc) -> yyrev(T, [H|Acc]);
+yyrev([], Acc) -> Acc.
+
+yypre([H|T], N) when N > 0 -> [H|yypre(T, N-1)];
+yypre(_L, _N) -> [].
+
+yysuf([_H|T], N) when N > 0 -> yysuf(T, N-1);
+yysuf(L, 0) -> L.
+
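+%% yysplit splits off the first N characters,
+%% e.g. yysplit("abcde", 2) yields {"ab", "cde"}.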
+yysplit(L, N) -> yysplit(L, N, []).
+yysplit([H|T], N, Acc) when N > 0 -> yysplit(T, N-1, [H|Acc]);
+yysplit(L, 0, Acc) -> {lists:reverse(Acc), L}.
+
+%%%%%
+%% Experimentation with incremental parsing (working)
+
+str0() -> "-bar+ +foo-".
+toks0() ->
+ {ok, Toks} = test_string(str0()),
+ Toks.
+
+%% Hand-hacked version of Toks0 with the space character deleted and
+%% its token marked as dirty. When properly re-scanned, this should be
+%% the same as toks1 (basis of the test case)
+dirty_toks() -> [OK_A,OK_B,OK_C,Changed0|Following0] = toks0(),
+ Changed1 = Changed0#token{dirty=true},
+ Following1 = lists:map(fun(T) ->
+ Start = T#token.start,
+ Finish = T#token.finish,
+ T#token{start=Start-1,
+ finish=Finish-1}
+ end,
+ Following0),
+ [OK_A,OK_B,OK_C,Changed1|Following1].
+
+%% for: "-bar++foo-"
+%% The space has been deleted (now length 0) and its token is marked dirty.
+str1() -> str0() -- " ".
+toks1() ->
+ {ok, Toks} = test_string(str1()),
+ Toks.
+
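+%% Returns {true, _, _} when incrementally re-scanning the dirty
+%% token stream reproduces toks1().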
+rescan_test() ->
+ Scn = make_test_scanner(),
+ Dirty = mark_dirty(dirty_toks()),
+ Cord = cord:new(str1()),
+ Result = rescan(Scn, Cord, Dirty),
+ {Result == toks1(), Result, Dirty}.
+
+%% What should happen with this simple algorithm and test case:
+%% 1. We see that a token is dirty.
+%% 2. We scan backwards for dependencies (by seeing what has a look-ahead
+%% that reaches the dirty token, or another token that does, etc)
+%% 3. From the earliest dependency, we start re-scanning everything
+%% until we scan a token which leaves the state of the lexer the same as
+%% it was previously (i.e. so the following token is known to be
+%% unchanged)
+%%
+%% So, how is the state of the lexer defined? In leex it seems to me
+%% that the internal scanner state is always the same at the start of
+%% each token (barring anything magical done in actions - which for
+%% now I ignore). So, I think it's safe to assume that a token will be
+%% unchanged if both the chars in its lexeme and the ones reached by
+%% its look-ahead are the same. For "tricky stuff" it may be necessary
+%% to capture the process dictionary and include (some of) it in the
+%% state.
+%%
+%% So I will terminate when I reach a token beyond the changed text
+%% which is starting in the same place.
+%%
+%% NB: We shouldn't need to go *too* far backwards when marking
+%% dependencies in languages I can think of, because of common
+%% zero-lookahead tokens like: \n , ; ( ) ...etc and most other
+%% tokens will just be 1-lookahead.
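+
+%% A minimal runnable illustration (with made-up token values) of the
+%% dependency rule used by mark_dirty1/3: a token reaches position P
+%% iff (finish - 1) + lookahead >= P.
+dependency_example() ->
+    A = #token{type=atom, start=1, finish=4, lookahead=1},
+    B = #token{type=plus, start=4, finish=5, lookahead=0, dirty=true},
+    %% A's last char is at position 3; with lookahead 1 the DFA also
+    %% examined position 4, which is B's start, so editing B dirties A.
+    true = (A#token.finish - 1) + A#token.lookahead >= B#token.start,
+    mark_dirty([A, B]).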
+
+re_lex(Toks0) ->
+    mark_dirty(Toks0).
+
+mark_dirty(Toks) ->
+ mark_dirty(Toks, []).
+
+mark_dirty([H|T], Acc) when H#token.dirty == true ->
+ mark_dirty1([H|T], H#token.start, Acc);
+mark_dirty([H|T], Acc) ->
+ mark_dirty(T, [H|Acc]);
+mark_dirty([], Acc) ->
+ lists:reverse(Acc).
+
+mark_dirty1(OK, DirtyPos, Toks0) ->
+ F = fun(Tok, DP) ->
+ case (Tok#token.finish-1) + Tok#token.lookahead of
+ P when P >= DP ->
+ {Tok#token{dirty=true},
+ Tok#token.start};
+ _ ->
+ {Tok, DP}
+ end
+ end,
+ {Toks1, _} = lists:mapfoldl(F, DirtyPos, Toks0),
+ lists:reverse(Toks1) ++ OK.
+
+%% Rescan dirty tokens.
+rescan(_Scn, _Cord, []) ->
+    [];
+rescan(Scn, Cord, [H|T]) when H#token.dirty == false ->
+ [H|rescan(Scn, Cord, T)];
+rescan(Scn, Cord, [H|T]) when H#token.dirty == true ->
+ Pos = H#token.start,
+ {_, Region} = cord:split(Cord, Pos-1),
+ Walker = cord:walker(Region),
+ rescan_dirty(Scn, Walker, Pos, [H|T]).
+
+%% rescan_dirty(Toks)
+%%
+%% The first token is dirty. Scan until we get back to a sane state
+rescan_dirty(Scn, W0, Pos, [Tok|Toks]) ->
+    io:format("(Pos = ~p) Rescanning ~p~n", [Pos, Tok]),
+    case token(Scn, W0) of
+        {done, eof} ->
+            [];
+        {done, {error, Reason}, _W1} ->
+            %% FIXME: should make an error-token and carry on
+            {error, Reason};
+        {done, Result, W1} ->
+            Token = make_token(Pos, Result),
+            [Token|rescan_dirty_cont(Scn, W1, Token#token.finish, Toks)]
+    end.
+
+rescan_dirty_cont(_Scn, _W, _Pos, []) ->
+    [];
+rescan_dirty_cont(Scn, W, Pos, Rest) ->
+ Next = hd(Rest),
+ if
+ %% This token no longer exists!
+ Next#token.finish =< Pos ->
+            io:format("(Pos = ~p) Discarding token: ~p~n", [Pos, Next]),
+ rescan_dirty_cont(Scn, W, Pos, tl(Rest));
+ %% We need to carry on if the token is known to be dirty, or
+ %% if we aren't at the same place that it was scanned from
+ %% before
+ Next#token.dirty == true;
+ Next#token.start /= Pos ->
+ rescan_dirty(Scn, W, Pos, Rest);
+ true ->
+ Rest
+ end.
+
+make_token(Pos, {ok, T, Acs, LookAhead}) ->
+ Type = element(1, T),
+ Len = length(Acs),
+ #token{type=Type,
+ start=Pos,
+ finish=Pos+Len,
+ lookahead=LookAhead};
+make_token(Pos, {skip_token, Acs, LookAhead}) ->
+ Len = length(Acs),
+ #token{type=em_skipped,
+ start=Pos,
+ finish=Pos+Len,
+ lookahead=LookAhead}.
+
+make_error_token(Pos) ->
+ #token{type=em_error,
+ start=Pos,
+ finish=Pos+1,
+ lookahead=1}.
+