%%%----------------------------------------------------------------------
%%% File    : em_scan.erl
%%% Author  : Luke Gorrie
%%% Purpose : Scanner for leex-style DFAs.
%%% Created : 1 May 2001 by Luke Gorrie
%%%----------------------------------------------------------------------

-module(em_scan).
-author('luke@bluetail.com').

-record(scanner, {initial_state, dfa_fn, action_fn}).

%% Token record. The actual lexeme, if returned by the scanner, is
%% discarded - all that's taken from the user-written "actions" is the
%% token type. Instead we just keep the start/finish positions in a
%% cord.
%%
%% Line/column information is not retained; hopefully it is convenient
%% to derive later.
-record(token,
        {
          %% type chosen by scanner. There is also a special type:
          %% 'em_skipped', used when the scanner returns 'skip_token'
          %% but we record the token anyway.
          type,
          %% start position in cord
          start,
          %% finish position in cord (point *after* the last character)
          finish,
          %% number of characters beyond the end of the lexeme that
          %% the scanner state machine examined, recorded to track
          %% dependencies
          lookahead,
          %% book keeping: whether the token is known to need
          %% re-scanning
          dirty=false
         }).

-compile(export_all).
%%-export([Function/Arity, ...]).

%% Bogus line number to pass to leex/yecc, because I don't want to
%% track lines at this low level. It's not safe to use an atom, but
%% hopefully -42 looks deliberate enough to prompt a grep :-)
-define(line, -42).   %% Confusing number appearing where line # should be :-)

make_scanner(State0, DFA, Action) ->
    #scanner{initial_state=State0,
             dfa_fn=DFA,
             action_fn=Action}.

make_scheme_scanner() ->
    make_leex_scanner(em_scheme_scan).

make_erlang_scanner() ->
    make_leex_scanner(em_erlang_scan).

make_test_scanner() ->
    make_leex_scanner(em_test_scan).

make_leex_scanner(Mod) ->
    make_scanner(Mod:yystate(), {Mod, yystate}, {Mod, yyaction}).

scan_annotation(Scanner, _S0, Cord, _RText, _Start, _End) ->
    Walker = cord:walker(Cord),
    case edit_scan(Scanner, Walker) of
        {ok, Toks} ->
            {ok, Toks};
        {error, _Reason} ->
            {ok, bad_scan}
    end.

test_test(Str) ->
    case test_string(Str) of
        {ok, Toks} ->
            {ok, lists:map(fun(T) -> element(1, T) end, Toks)};
        X ->
            X
    end.

erlang_test(Str) ->
    case erlang_string(Str) of
        {ok, Toks} ->
            {ok, lists:map(fun(T) -> element(1, T) end, Toks)};
        X ->
            X
    end.

scheme_test(Str) ->
    case scheme_string(Str) of
        {ok, Toks} ->
            {ok, lists:map(fun(T) -> element(1, T) end, Toks)};
        X ->
            X
    end.

test_string(Str) ->
    Cord = list_to_binary(Str),
    Scan = make_test_scanner(),
    edit_scan(Scan, cord:walker(Cord)).

scheme_string(Str) ->
    Cord = list_to_binary(Str),
    Scan = make_scheme_scanner(),
    edit_scan(Scan, cord:walker(Cord)).

erlang_string(Str) ->
    Cord = list_to_binary(Str),
    Scan = make_erlang_scanner(),
    edit_scan(Scan, cord:walker(Cord)).

%% Returns: {ok, [#token]} | {error, Reason}
edit_scan(Scn, Walker) ->
    edit_scan(Scn, Walker, 1, []).

edit_scan(Scn, Walker, Pos) ->
    edit_scan(Scn, Walker, Pos, []).

edit_scan(Scn, W0, Pos, Acc) ->
    case token(Scn, W0) of
        {done, eof} ->
            {ok, lists:reverse(Acc)};
        %% token/2 reports scan errors as {done, {error, _}, _}
        {done, {error, Reason}, _W1} ->
            {error, Reason};
        {done, Result, W1} ->
            Token = make_token(Pos, Result),
            edit_scan(Scn, W1, Token#token.finish, [Token|Acc])
    end.

make_skipped_token(Pos, Len, LookAhead) ->
    #token{type=em_skipped, start=Pos, finish=Pos+Len, lookahead=LookAhead}.

% tokens(Scn, Walker) ->
%     tokens(Scn, Walker, []).
%
% tokens(Scn, W0, Acc) ->
%     case token(Scn, W0) of
%         {done, {ok, T, Acs}, W1} ->
%             tokens(Scn, W1, [T|Acc]);
%         {done, {skip_token, Acs}, W1} ->
%             tokens(Scn, W1, Acc);
%         {done, eof} ->
%             {ok, lists:reverse(Acc)};
%         {error, Reason} ->
%             {error, Reason}
%     end.
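%% Illustrative sketch, not part of the original module: run the test
%% scanner from the shell and print each #token{} it produces. The
%% name demo_tokens/1 is an addition for illustration; it only uses
%% functions defined above, so the token types printed depend entirely
%% on em_test_scan's rules.
demo_tokens(Str) ->
    {ok, Toks} = test_string(Str),
    lists:foreach(
      fun(#token{type=Ty, start=S, finish=F, lookahead=LA, dirty=D}) ->
              io:format("~p: chars [~p,~p), lookahead ~p, dirty ~p~n",
                        [Ty, S, F, LA, D])
      end,
      Toks).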
%% Drive the DFA one character at a time, pulled from the cord walker.
token(Scn, Walker) ->
    State0 = Scn#scanner.initial_state,
    token(Scn, Walker, State0, [], 0, reject, 0).

token(Scn, W0, S0, Tcs0, Tlen0, A0, Alen0) ->
    ActionF = Scn#scanner.action_fn,
    DFA_F = Scn#scanner.dfa_fn,
    {Ics, Tcs1, W1} =
        case cord:walker_next(W0) of
            {done, WEOF} ->
                {eof, Tcs0, WEOF};
            {Ch, Wnext} ->
                {[Ch], [Ch|Tcs0], Wnext}
        end,
    case DFA_F(S0, Ics, ?line, Tlen0, A0, Alen0) of
        {A1,Alen1,[],_L1} ->            % accepting end state
            TcsFwd = lists:reverse(Tcs1),
            token_cont(Scn,TcsFwd,W1,[],ActionF(A1,Alen1,TcsFwd,?line));
        {A1,Alen1,[],_L1,S1} ->         % after accepting state
            token(Scn,W1,S1,Tcs1,Alen1,A1,Alen1);
        {A1,Alen1,Ics1,_L1,_S1} ->      % accepting state with leftover
                                        % chars.. sounds like a time to act.
            TcsFwd = lists:reverse(Tcs1),
            Acs = yypre(TcsFwd, Alen1),
            token_cont(Scn,Acs,W1,Ics1,ActionF(A1,Alen1,TcsFwd,?line));
        {A1,Alen1,Tlen1,[],_L1,S1} ->   % after a non-accepting state
            token(Scn,W1,S1,Tcs1,Tlen1,A1,Alen1);
        {reject,_Alen1,_Tlen1,eof,_L1,_S1} ->
            {done,eof};
        {reject,_Alen1,Tlen1,Ics1,_L1,_S1} ->
            {done, {error,{illegal,yypre(Tcs1, Tlen1+1)}}, Ics1};
        {A1,Alen1,_Tlen1,_Ics1,_L1,_S1} ->
            TcsFwd = lists:reverse(Tcs1),
            {Acs, Rest} = yysplit(TcsFwd, Alen1),
            token_cont(Scn,Acs,W1,Rest,ActionF(A1,Alen1,TcsFwd,?line))
    end.

%% The skip_token clause matches the literal atom that leex actions
%% return; a variable here would also swallow {error, S} results.
token_cont(_Scn, Acs, W, Rest, {token, T}) ->
    {done, {ok, T, Acs, length_of(Rest)}, pushback(W, Rest)};
token_cont(_Scn, Acs, W, Rest, {end_token, T}) ->
    {done, {ok, T, Acs, length_of(Rest)}, pushback(W, Rest)};
token_cont(_Scn, Acs, W, Rest, skip_token) ->
    {done, {skip_token, Acs, length_of(Rest)}, pushback(W, Rest)};
token_cont(_Scn, _Acs, W, Rest, {error, S}) ->
    {done, {error, {user,S}}, pushback(W, Rest)}.

adjust_pos(_C, L, [$\n|T]) -> adjust_pos(0, L+1, T);
adjust_pos(C, L, [_|T])    -> adjust_pos(C+1, L, T);
adjust_pos(C, L, [])       -> {C, L}.

length_of(eof)  -> 0;
length_of(List) -> length(List).

pushback(W, eof) ->
    W;
pushback(W, Chars) ->
    lists:foldr(fun(C, Wn) -> cord:walker_push(C, Wn) end, W, Chars).

yyrev([H|T], Acc) -> yyrev(T, [H|Acc]);
yyrev([], Acc)    -> Acc.

yypre([H|T], N) when N > 0 -> [H|yypre(T, N-1)];
yypre(_L, _N)              -> [].

yysuf([_H|T], N) when N > 0 -> yysuf(T, N-1);
yysuf(L, 0)                 -> L.

yysplit(L, N) -> yysplit(L, N, []).

yysplit([H|T], N, Acc) when N > 0 ->
    yysplit(T, N-1, [H|Acc]);
yysplit(L, 0, Acc) ->
    {lists:reverse(Acc), L}.

%%%%%

%% Experimentation with incremental parsing (working)

str0() -> "-bar+ +foo-".

toks0() ->
    {ok, Toks} = test_string(str0()),
    Toks.

%% Hand-hacked version of toks0() with the space character deleted and
%% its token marked as dirty. When properly re-scanned, this should be
%% the same as toks1() (basis of the test case).
dirty_toks() ->
    [OK_A,OK_B,OK_C,Changed0|Following0] = toks0(),
    Changed1 = Changed0#token{dirty=true},
    Following1 = lists:map(fun(T) ->
                                   Start = T#token.start,
                                   Finish = T#token.finish,
                                   T#token{start=Start-1, finish=Finish-1}
                           end,
                           Following0),
    [OK_A,OK_B,OK_C,Changed1|Following1].

%% for: "-bar++foo-"
%% The space has been changed (now length 0) and is marked dirty.

str1() -> str0() -- " ".

toks1() ->
    {ok, Toks} = test_string(str1()),
    Toks.

rescan_test() ->
    Scn = make_test_scanner(),
    Dirty = mark_dirty(dirty_toks()),
    Cord = cord:new(str1()),
    Result = rescan(Scn, Cord, Dirty),
    {Result == toks1(), Result, Dirty}.
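%% Illustrative sketch, not original code: show which tokens the
%% backwards dependency pass flags for the hand-edited list above.
%% demo_mark_dirty/0 is an added name; mark_dirty/1 (defined below)
%% flags any token whose lookahead reaches the start of an
%% already-dirty token, so dirtiness can propagate backwards.
demo_mark_dirty() ->
    [{T#token.type, T#token.dirty} || T <- mark_dirty(dirty_toks())].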
%% What should happen with this simple algorithm and test case:
%% 1. We see that a token is dirty.
%% 2. We scan backwards for dependencies (by seeing what has a look-ahead
%% that reaches the dirty token, or another token that does, etc.)
%% 3. From the earliest dependency, we start re-scanning everything
%% until we scan a token which leaves the state of the lexer the same as
%% it was previously (i.e. so the following token is known to be
%% unchanged).
%%
%% So, how is the state of the lexer defined? In leex it seems to me
%% that the internal scanner state is always the same at the start of
%% each token (barring anything magical done in actions - which for
%% now I ignore). So, I think it's safe to assume that a token will be
%% unchanged if both the chars in its lexeme and the ones reached by
%% its look-ahead are the same. For "tricky stuff" it may be necessary
%% to capture the process dictionary and include (some of) it in the
%% state.
%%
%% So I will terminate when I reach a token beyond the changed text
%% which starts in the same place.
%%
%% NB: We shouldn't need to go *too* far backwards when marking
%% dependencies in languages I can think of, because of common
%% zero-lookahead tokens like: \n , ; ( ) ...etc, and most other
%% tokens will just be 1-lookahead.

re_lex(Toks0) ->
    mark_dirty(Toks0).

mark_dirty(Toks) ->
    mark_dirty(Toks, []).

mark_dirty([H|T], Acc) when H#token.dirty == true ->
    mark_dirty1([H|T], H#token.start, Acc);
mark_dirty([H|T], Acc) ->
    mark_dirty(T, [H|Acc]);
mark_dirty([], Acc) ->
    lists:reverse(Acc).

%% Walk the reversed prefix Toks0, marking every token whose lookahead
%% reaches the dirty position; each newly-dirty token moves the
%% dependency position further back.
mark_dirty1(OK, DirtyPos, Toks0) ->
    F = fun(Tok, DP) ->
                case (Tok#token.finish-1) + Tok#token.lookahead of
                    P when P >= DP ->
                        {Tok#token{dirty=true}, Tok#token.start};
                    _ ->
                        {Tok, DP}
                end
        end,
    {Toks1, _} = lists:mapfoldl(F, DirtyPos, Toks0),
    lists:reverse(Toks1) ++ OK.

%% Rescan dirty tokens.
rescan(_Scn, _Cord, []) ->
    [];
rescan(Scn, Cord, [H|T]) when H#token.dirty == false ->
    [H|rescan(Scn, Cord, T)];
rescan(Scn, Cord, [H|T]) when H#token.dirty == true ->
    Pos = H#token.start,
    {_, Region} = cord:split(Cord, Pos-1),
    Walker = cord:walker(Region),
    rescan_dirty(Scn, Walker, Pos, [H|T]).

%% rescan_dirty(Scn, Walker, Pos, Toks)
%%
%% The first token is dirty. Scan until we get back to a sane state.
rescan_dirty(Scn, W0, Pos, [Tok|Toks]) ->
    io:format("(Pos = ~p) Rescanning ~p~n", [Pos, Tok]),
    case token(Scn, W0) of
        {done, eof} ->
            [];
        {done, {error, Reason}, _W1} ->
            %% FIXME: should make an error-token and carry on
            {error, Reason};
        {done, Result, W1} ->
            Token = make_token(Pos, Result),
            [Token|rescan_dirty_cont(Scn, W1, Token#token.finish, Toks)]
    end.

rescan_dirty_cont(_Scn, _W, _Pos, []) ->
    [];
rescan_dirty_cont(Scn, W, Pos, Rest) ->
    Next = hd(Rest),
    if
        %% This token no longer exists!
        Next#token.finish =< Pos ->
            io:format("(Pos = ~p) Discarding token: ~p~n", [Pos, Next]),
            rescan_dirty_cont(Scn, W, Pos, tl(Rest));
        %% We need to carry on if the token is known to be dirty, or
        %% if we aren't at the same place that it was scanned from
        %% before.
        Next#token.dirty == true;
        Next#token.start /= Pos ->
            rescan_dirty(Scn, W, Pos, Rest);
        true ->
            Rest
    end.

make_token(Pos, {ok, T, Acs, LookAhead}) ->
    Type = element(1, T),
    Len = length(Acs),
    #token{type=Type, start=Pos, finish=Pos+Len, lookahead=LookAhead};
make_token(Pos, {skip_token, Acs, LookAhead}) ->
    Len = length(Acs),
    #token{type=em_skipped, start=Pos, finish=Pos+Len, lookahead=LookAhead}.

make_error_token(Pos) ->
    #token{type=em_error, start=Pos, finish=Pos+1, lookahead=1}.
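%% Illustrative sketch, not original code: cross-check the incremental
%% path against a from-scratch scan, like rescan_test/0 but reduced to
%% token types so the two lists are easy to eyeball in the shell. The
%% name demo_rescan_types/0 is an addition for illustration.
demo_rescan_types() ->
    Scn = make_test_scanner(),
    Incremental = rescan(Scn, cord:new(str1()), mark_dirty(dirty_toks())),
    {ok, Fresh} = test_string(str1()),
    {[T#token.type || T <- Incremental],
     [T#token.type || T <- Fresh]}.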