%%%----------------------------------------------------------------------
%%% File : em_scan.erl
%%% Author : Luke Gorrie <luke@bluetail.com>
%%% Purpose : Scanner for leex-style DFAs.
%%% Created : 1 May 2001 by Luke Gorrie <luke@bluetail.com>
%%%----------------------------------------------------------------------
-module(em_scan).
-author('luke@bluetail.com').
-record(scanner, {initial_state, dfa_fn, action_fn}).
%% Token record. The actual lexeme, if returned by the scanner, is
%% discarded - all that's taken from the user-written "actions" is the
%% token type. Instead we just keep the start/finish positions in a
%% cord.
%%
%% Not retaining line/column information. Hopefully that's convenient
%% to derive later.
-record(token,
{
%% type chosen by scanner. There is also a special type:
%% 'em_skipped', when the scanner returns 'skip_token' but
%% we record it anyway.
type,
%% start position in cord
start,
%% finish position in cord (point *after* the last character)
finish,
%% number of characters beyond the end of the lexeme that
%% the scanner state machine examined, recorded to track
%% dependency
lookahead,
%% for book keeping, saying whether the token is known to
%% need re-scanning
dirty=false
}).
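%% Example with hypothetical values: scanning "foo(" from position 1,
%% where the scanner's action returns {token, {atom, ...}}, could yield
%%
%%   #token{type=atom, start=1, finish=4, lookahead=1}
%%
%% i.e. the lexeme occupies positions 1..3, finish points just past it,
%% and the "(" that the DFA examined to terminate the atom is one
%% character of lookahead.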
-compile(export_all).
%%-export([Function/Arity, ...]).
%% Bogus line number to pass to leex/yecc, because I don't want to
%% track lines at this low level. It's not safe to use an atom, but
%% hopefully -42 looks deliberate enough to prompt a grep :-)
-define(line, -42). %% Confusing number appearing where line # should be :-)
make_scanner(State0, DFA, Action) ->
#scanner{initial_state=State0,
dfa_fn=DFA,
action_fn=Action}.
make_scheme_scanner() -> make_leex_scanner(em_scheme_scan).
make_erlang_scanner() -> make_leex_scanner(em_erlang_scan).
make_test_scanner() -> make_leex_scanner(em_test_scan).
make_leex_scanner(Mod) ->
make_scanner(Mod:yystate(),
{Mod, yystate},
{Mod, yyaction}).
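%% Usage sketch (assuming the leex-generated module em_test_scan is
%% compiled and the cord module is available):
%%
%%   Scn = em_scan:make_leex_scanner(em_test_scan),
%%   {ok, Toks} = em_scan:edit_scan(Scn, cord:walker(<<"-bar+ +foo-">>)).
%%
%% Any module exporting yystate/0, yystate/6 and yyaction/4 will do,
%% since a scanner is just those three entry points.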
scan_annotation(Scanner, _S0, Cord, _RText, _Start, _End) ->
    Walker = cord:walker(Cord),
    case edit_scan(Scanner, Walker) of
	{ok, Toks} ->
	    {ok, Toks};
	{error, _Reason} ->
	    {ok, bad_scan}
    end.
test_test(Str) ->
    case test_string(Str) of
	{ok, Toks} ->
	    {ok, lists:map(fun(T) -> T#token.type end, Toks)};
	X ->
	    X
    end.
erlang_test(Str) ->
    case erlang_string(Str) of
	{ok, Toks} ->
	    {ok, lists:map(fun(T) -> T#token.type end, Toks)};
	X ->
	    X
    end.
scheme_test(Str) ->
    case scheme_string(Str) of
	{ok, Toks} ->
	    {ok, lists:map(fun(T) -> T#token.type end, Toks)};
	X ->
	    X
    end.
test_string(Str) ->
Cord = list_to_binary(Str),
Scan = make_test_scanner(),
edit_scan(Scan, cord:walker(Cord)).
scheme_string(Str) ->
Cord = list_to_binary(Str),
Scan = make_scheme_scanner(),
edit_scan(Scan, cord:walker(Cord)).
erlang_string(Str) ->
Cord = list_to_binary(Str),
Scan = make_erlang_scanner(),
edit_scan(Scan, cord:walker(Cord)).
%% Returns: {ok, [#token]} | {error, Reason}
edit_scan(Scn, Walker) ->
edit_scan(Scn, Walker, 1, []).
edit_scan(Scn, Walker, Pos) ->
    edit_scan(Scn, Walker, Pos, []).
edit_scan(Scn, W0, Pos, Acc) ->
    case token(Scn, W0) of
	{done, eof} ->
	    {ok, lists:reverse(Acc)};
	{done, {error, Reason}, _W1} ->
	    {error, Reason};
	{done, Result, W1} ->
	    Token = make_token(Pos, Result),
	    edit_scan(Scn, W1, Token#token.finish, [Token|Acc]);
	{error, Reason} ->
	    {error, Reason}
    end.
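%% Positions are 1-based and tokens are half-open intervals: a token
%% occupies [start, finish), and the next token begins exactly at the
%% previous token's finish, so the cord is tiled by its tokens with no
%% gaps (skipped lexemes become em_skipped tokens).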
make_skipped_token(Pos, Len, LookAhead) ->
#token{type=em_skipped,
start=Pos,
finish=Pos+Len,
lookahead=LookAhead}.
% tokens(Scn, Walker) ->
% tokens(Scn, Walker, []).
% tokens(Scn, W0, Acc) ->
% case token(Scn, W0) of
% {done, {ok, T, Acs}, W1} ->
% tokens(Scn, W1, [T|Acc]);
% {done, {skip_token, Acs}, W1} ->
% tokens(Scn, W1, Acc);
% {done, eof} ->
% {ok, lists:reverse(Acc)};
% {error, Reason} ->
% {error, Reason}
% end.
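%% token(Scanner, Walker) runs the DFA over one lexeme, pulling input a
%% character at a time from the cord walker. Returns:
%%
%%   {done, eof}                                   no more input
%%   {done, {ok, Tok, Lexeme, Lookahead}, W1}      action produced a token
%%   {done, {skip_token, Lexeme, Lookahead}, W1}   action skipped the lexeme
%%   {done, {error, Reason}, W1}                   illegal input or user error
%%
%% Lookahead counts the characters the DFA examined beyond the lexeme;
%% they are pushed back onto the walker before returning.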
token(Scn, Walker) ->
State0 = Scn#scanner.initial_state,
token(Scn, Walker, State0, [], 0, reject, 0).
token(Scn, W0, S0, Tcs0, Tlen0, A0, Alen0) ->
ActionF = Scn#scanner.action_fn,
DFA_F = Scn#scanner.dfa_fn,
{Ics, Tcs1, W1} = case cord:walker_next(W0) of
{done, WEOF} ->
{eof, Tcs0, WEOF};
{Ch, Wnext} ->
{[Ch], [Ch|Tcs0], Wnext}
end,
    case DFA_F(S0, Ics, ?line, Tlen0, A0, Alen0) of
	{A1,Alen1,[],_L1} ->			% accepting end state
	    TcsFwd = lists:reverse(Tcs1),
	    token_cont(Scn,TcsFwd,W1,[],ActionF(A1,Alen1,TcsFwd,?line));
	{A1,Alen1,[],_L1,S1} ->			% after accepting state
	    token(Scn,W1,S1,Tcs1,Alen1,A1,Alen1);
	{A1,Alen1,Ics1,_L1,_S1} ->		% accepting state with leftover
						% chars.. sounds like a time
						% to act.
	    TcsFwd = lists:reverse(Tcs1),
	    Acs = yypre(TcsFwd, Alen1),
	    token_cont(Scn,Acs,W1,Ics1,ActionF(A1,Alen1,TcsFwd,?line));
	{A1,Alen1,Tlen1,[],_L1,S1} ->		% after a non-accepting state
	    token(Scn,W1,S1,Tcs1,Tlen1,A1,Alen1);
	{reject,_Alen1,_Tlen1,eof,_L1,_S1} ->
	    {done,eof};
	{reject,_Alen1,Tlen1,Ics1,_L1,_S1} ->
	    %% push the unconsumed characters back so the caller gets a
	    %% walker, consistent with the other return paths
	    {done,
	     {error,{illegal,yypre(Tcs1, Tlen1+1)}},
	     pushback(W1, Ics1)};
	{A1,Alen1,_Tlen1,Ics1,_L1,_S1} ->
	    TcsFwd = lists:reverse(Tcs1),
	    {Acs, Rest} = yysplit(TcsFwd, Alen1),
	    token_cont(Scn,Acs,W1,Rest,ActionF(A1,Alen1,TcsFwd,?line))
    end.
token_cont(_Scn, Acs, W, Rest, {token, T}) ->
    {done, {ok, T, Acs, length_of(Rest)}, pushback(W, Rest)};
token_cont(_Scn, Acs, W, Rest, {end_token, T}) ->
    {done, {ok, T, Acs, length_of(Rest)}, pushback(W, Rest)};
token_cont(_Scn, Acs, W, Rest, skip_token) ->
    {done, {skip_token, Acs, length_of(Rest)}, pushback(W, Rest)};
token_cont(_Scn, _Acs, W, Rest, {error, S}) ->
    {done, {error, {user,S}}, pushback(W, Rest)}.
adjust_pos(_C, L, [$\n|T]) -> adjust_pos(0, L+1, T);
adjust_pos(C, L, [_|T]) -> adjust_pos(C+1, L, T);
adjust_pos(C, L, []) -> {C, L}.
length_of(eof) -> 0;
length_of(List) -> length(List).
pushback(W, eof) ->
W;
pushback(W, Chars) ->
lists:foldr(fun(C, Wn) -> cord:walker_push(C, Wn) end,
W,
Chars).
yyrev([H|T], Acc) -> yyrev(T, [H|Acc]);
yyrev([], Acc) -> Acc.
yypre([H|T], N) when N > 0 -> [H|yypre(T, N-1)];
yypre(_L, _N) -> [].
yysuf([_H|T], N) when N > 0 -> yysuf(T, N-1);
yysuf(L, 0) -> L.
yysplit(L, N) -> yysplit(L, N, []).
yysplit([H|T], N, Acc) when N > 0 -> yysplit(T, N-1, [H|Acc]);
yysplit(L, 0, Acc) -> {lists:reverse(Acc), L}.
%%%%%
%% Experimentation with incremental parsing (working)
str0() -> "-bar+ +foo-".
toks0() ->
{ok, Toks} = test_string(str0()),
Toks.
%% Hand-hacked version of toks0() with the space character deleted and
%% its token marked as dirty. When properly re-scanned, this should be
%% the same as toks1() (the basis of the test case).
dirty_toks() ->
    [OK_A,OK_B,OK_C,Changed0|Following0] = toks0(),
    Changed1 = Changed0#token{dirty=true},
    Following1 = lists:map(fun(T) ->
				   Start = T#token.start,
				   Finish = T#token.finish,
				   T#token{start=Start-1,
					   finish=Finish-1}
			   end,
			   Following0),
    [OK_A,OK_B,OK_C,Changed1|Following1].
%% for: "-bar++foo-"
%% The space has been deleted (now length 0) and its token is marked dirty.
str1() -> str0() -- " ".
toks1() ->
{ok, Toks} = test_string(str1()),
Toks.
rescan_test() ->
Scn = make_test_scanner(),
Dirty = mark_dirty(dirty_toks()),
Cord = cord:new(str1()),
Result = rescan(Scn, Cord, Dirty),
{Result == toks1(), Result, Dirty}.
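%% Running em_scan:rescan_test() from the shell should return
%% {true, Toks, Dirty} when the incremental re-scan of the edited cord
%% agrees token-for-token with a full scan of str1().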
%% What should happen with this simple algorithm and test case:
%% 1. We see that a token is dirty.
%% 2. We scan backwards for dependencies (by seeing what has a look-ahead
%% that reaches the dirty token, or another token that does, etc)
%% 3. From the earliest dependency, we start re-scanning everything
%% until we scan a token which leaves the state of the lexer the same as
%% it was previously (i.e. so the following token is known to be
%% unchanged)
%%
%% So, how is the state of the lexer defined? In leex it seems to me
%% that the internal scanner state is always the same at the start of
%% each token (barring anything magical done in actions - which for
%% now I ignore). So, I think it's safe to assume that a token will be
%% unchanged if both the chars in its lexeme and the ones reached by
%% its look-ahead are the same. For "tricky stuff" it may be necessary
%% to capture the process dictionary and include (some of) it in the
%% state.
%%
%% So I will terminate when I reach a token beyond the changed text
%% which starts in the same place.
%%
%% NB: We shouldn't need to go *too* far backwards when marking
%% dependencies in the languages I can think of, because of common
%% zero-lookahead tokens like \n , ; ( ) etc., and most other tokens
%% will have just one character of lookahead.
re_lex(Toks0) ->
    mark_dirty(Toks0).
mark_dirty(Toks) ->
mark_dirty(Toks, []).
mark_dirty([H|T], Acc) when H#token.dirty == true ->
mark_dirty1([H|T], H#token.start, Acc);
mark_dirty([H|T], Acc) ->
mark_dirty(T, [H|Acc]);
mark_dirty([], Acc) ->
lists:reverse(Acc).
mark_dirty1(OK, DirtyPos, Toks0) ->
F = fun(Tok, DP) ->
case (Tok#token.finish-1) + Tok#token.lookahead of
P when P >= DP ->
{Tok#token{dirty=true},
Tok#token.start};
_ ->
{Tok, DP}
end
end,
{Toks1, _} = lists:mapfoldl(F, DirtyPos, Toks0),
lists:reverse(Toks1) ++ OK.
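%% Worked sketch with made-up lookahead values: suppose the deleted
%% space's token spans [6,7) and is dirty, so DirtyPos starts at 6.
%% Walking back over the clean prefix, a token is dirtied when
%% (finish-1) + lookahead >= DirtyPos. A "+" token spanning [5,6) with
%% lookahead 1 reaches (6-1)+1 = 6 >= 6, so it is dirtied and DirtyPos
%% drops to its start, 5; a "bar" token spanning [2,5) with lookahead 0
%% reaches (5-1)+0 = 4 < 5, so marking stops there.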
%% Rescan dirty tokens.
rescan(_Scn, _Cord, []) ->
    [];
rescan(Scn, Cord, [H|T]) when H#token.dirty == false ->
    [H|rescan(Scn, Cord, T)];
rescan(Scn, Cord, [H|T]) when H#token.dirty == true ->
Pos = H#token.start,
{_, Region} = cord:split(Cord, Pos-1),
Walker = cord:walker(Region),
rescan_dirty(Scn, Walker, Pos, [H|T]).
%% rescan_dirty(Toks)
%%
%% The first token is dirty. Scan until we get back to a sane state
rescan_dirty(Scn, W0, Pos, [Tok|Toks]) ->
    io:format("(Pos = ~p) Rescanning ~p~n", [Pos, Tok]),
case token(Scn, W0) of
{done, eof} ->
[];
{done, Result, W1} ->
Token = make_token(Pos, Result),
[Token|rescan_dirty_cont(Scn, W1, Token#token.finish, Toks)];
{error, Reason} ->
%% FIXME: should make an error-token and carry on
{error, Reason}
end.
rescan_dirty_cont(_Scn, _W, _Pos, []) ->
    [];
rescan_dirty_cont(Scn, W, Pos, Rest) ->
Next = hd(Rest),
if
%% This token no longer exists!
Next#token.finish =< Pos ->
	    io:format("(Pos = ~p) Discarding token: ~p~n", [Pos, Next]),
rescan_dirty_cont(Scn, W, Pos, tl(Rest));
%% We need to carry on if the token is known to be dirty, or
%% if we aren't at the same place that it was scanned from
%% before
Next#token.dirty == true;
Next#token.start /= Pos ->
rescan_dirty(Scn, W, Pos, Rest);
true ->
Rest
end.
make_token(Pos, {ok, T, Acs, LookAhead}) ->
Type = element(1, T),
Len = length(Acs),
#token{type=Type,
start=Pos,
finish=Pos+Len,
lookahead=LookAhead};
make_token(Pos, {skip_token, Acs, LookAhead}) ->
Len = length(Acs),
#token{type=em_skipped,
start=Pos,
finish=Pos+Len,
lookahead=LookAhead}.
make_error_token(Pos) ->
#token{type=em_error,
start=Pos,
finish=Pos+1,
lookahead=1}.