aboutsummaryrefslogtreecommitdiffhomepage
path: root/src/cord_regexp.erl
diff options
context:
space:
mode:
authorlukeg <lukeg>2003-02-21 19:01:14 +0000
committerlukeg <lukeg>2003-02-21 19:01:14 +0000
commite7d48fe500f6ed676ee1b212ebd61408bced1c5b (patch)
tree11a756c7bb4906f3e186c1cb8331cb7ed27bc69c /src/cord_regexp.erl
downloadermacs-fork-e7d48fe500f6ed676ee1b212ebd61408bced1c5b.tar.gz
*** empty log message ***
Diffstat (limited to 'src/cord_regexp.erl')
-rw-r--r--src/cord_regexp.erl296
1 files changed, 296 insertions, 0 deletions
diff --git a/src/cord_regexp.erl b/src/cord_regexp.erl
new file mode 100644
index 0000000..1d6bb4d
--- /dev/null
+++ b/src/cord_regexp.erl
@@ -0,0 +1,296 @@
+%%%----------------------------------------------------------------------
+%%% File : cord_regexp.erl
+%%% Author : Luke Gorrie <luke@bluetail.com>
+%%% Purpose : Regexp ops on cords
+%%% Created : 10 Mar 2001 by Luke Gorrie <luke@bluetail.com>
+%%%----------------------------------------------------------------------
+
+-module(cord_regexp).
+-author('luke@bluetail.com').
+
+-compile(export_all).
+%%-export([Function/Arity, ...]).
+
+first_match(RE, Cord) ->
+ first_match(RE, Cord, 1).
+
+first_match(RE, Cord, Pos) ->
+ first_match(RE, Cord, Pos, forward).
+
+%% Find the first match of RE in Cord starting from Pos.
+first_match(RE, Cord, Pos, Dir) ->
+ case first_match_c(RE, Cord, Pos, Dir) of
+ {{match, Start, Len}, WalkerOut} ->
+ {match, Start, Len};
+ X ->
+ X
+ end.
+
+first_match_c(RE, Cord) ->
+ first_match_c(RE, Cord, 1).
+
+first_match_c(RE, Cord, Pos) ->
+ first_match_c(RE, Cord, Pos, forward).
+
+first_match_c(RE, Cord, Pos, forward) ->
+ {_, Region} = cord:split(Cord, Pos-1),
+ first_match1(RE, cord:walker(Region, forward), Pos);
+
+first_match_c(RE, Cord, Pos, backward) ->
+ case regexp:parse(RE) of
+ {ok, REP} ->
+ {Region,_} = cord:split(Cord, Pos),
+ RREP = reverse(REP),
+ case first_match1(RREP, cord:walker(Region, backward), Pos) of
+ {{match, S, L}, C} ->
+ {{match, S, L}, C};
+ X ->
+ X
+ end;
+ {error, Rsn} ->
+ {error, Rsn}
+ end.
+
+continue_match(RE, {W, N}) ->
+ case first_match1(RE, W, N) of
+ {{match, S, L}, C} ->
+ {{match, S, L}, {C, S+L}};
+ X ->
+ X
+ end.
+
+first_match1(RE, W, Pos) when list(RE) ->
+ case regexp:parse(RE) of
+ {ok, REP} ->
+ first_match1(optimise(REP), W, Pos);
+ X = {error, Rsn} ->
+ X
+ end;
+first_match1(RE, Walker, Pos) ->
+ case cord:walker_at_end(Walker) of
+ true ->
+ nomatch;
+ false ->
+ Start = case cord:walker_direction(Walker) of
+ forward -> 1;
+ backward -> Pos
+ end,
+ case apply_regexp(RE, Start, Walker) of
+ nomatch ->
+ %% it would be more efficient to use Walker to get
+ %% a new continuation to recurse with
+ {_, W1} = cord:walker_next(Walker),
+ first_match1(RE, W1, advance(Pos, W1));
+ {match, EndPos, WalkerOut} ->
+ Len = case cord:walker_direction(Walker) of
+ forward -> EndPos;
+ backward -> Pos - EndPos - 1
+ end,
+ {{match, Pos, Len}, WalkerOut}
+ end
+ end.
+
+apply_regexp(RE, Pos, W) ->
+ {Ch, W1} = cord:walker_next(W),
+ re_apply(RE, [], Ch, Pos, W1).
+
+%% re_apply(RE, More, ThisChar, Pos, InputCont) => {match, Len} | nomatch
+
+%% FIXME: Handling of bos (^) and eos ($) need thinking about when it
+%% comes to backwards-searching. Right now, ^ will match the end when
+%% going backwards.
+
+re_apply(epsilon, More, Ch, P, C) ->
+ re_apply_more(More, P, push(Ch, C));
+re_apply(eos, More, done, P, C) ->
+ case cord:walker_direction(C) of
+ forward ->
+ re_apply_more(More, P, C);
+ backward ->
+ nomatch
+ end;
+re_apply(eos, More, $\n, P, C) ->
+ re_apply_more(More, P, push($\n, C)); % \n isn't consumed
+re_apply(eos, More, done, P, C) ->
+ case cord:walker_direction(C) of
+ forward -> re_apply_more(More, P, C);
+ backward -> nomatch
+ end;
+re_apply(bos, More, done, P, C) ->
+ case cord:walker_direction(C) of
+ forward -> nomatch;
+ backward -> re_apply_more(More, P, C)
+ end;
+re_apply(eos, _, done, _, _) ->
+ true;
+re_apply(eos, _, done, _, _) ->
+ true;
+re_apply({'or', RE1, RE2}, More, Ch, P, C) ->
+ re_apply_or({apply, RE1, More, Ch, P, C},
+ {apply, RE2, More, Ch, P, C});
+re_apply({concat, RE1, RE2}, More, Ch, P, C) ->
+ re_apply(RE1, [RE2|More], Ch, P, C);
+re_apply({kclosure, CE}, More, Ch, P, C) ->
+ re_apply_or({apply_more, More, P, push(Ch, C)},
+ {apply, CE, [{kclosure, CE}|More], Ch, P, C});
+re_apply({pclosure, CE}, More, Ch, P, C) ->
+ re_apply(CE, [{kclosure, CE}|More], Ch, P, C);
+re_apply({optional, CE}, More, Ch, P, C) ->
+ re_apply_or({apply_more, More, P, push(Ch, C)},
+ {apply, CE, More, Ch, P, C});
+re_apply(bos, More, Ch, 1, C) ->
+ case cord:walker_direction(C) of
+ forward -> re_apply_more(More, 1, push(Ch, C));
+ backward -> nomatch
+ end;
+re_apply(bos, More, $\n, P, C) ->
+ re_apply_more(More, 1, push($\n, C)); % \n isn't consumed
+re_apply({char_class, Cc}, More, Ch, P, C) ->
+ case in_char_class(Ch, Cc) of
+ true -> re_apply_more(More, advance(P, C), C);
+ false -> nomatch
+ end;
+re_apply({comp_class, Cc}, More, Ch, P, C) ->
+ case in_char_class(Ch, Cc) of
+ true -> nomatch;
+ false -> re_apply_more(More, advance(P, C), C)
+ end;
+re_apply(Ch, More, Ch, P, C) when integer(Ch) ->
+ re_apply_more(More, advance(P, C), C);
+re_apply(_, _, _, _, _) ->
+ nomatch.
+
+advance(Pos, Walker) ->
+ case cord:walker_direction(Walker) of
+ forward -> Pos + 1;
+ backward -> Pos - 1
+ end.
+
+re_apply_more([H|T], P, C) ->
+ {Next, C1} = cord:walker_next(C),
+ re_apply(H, T, Next, P, C1);
+re_apply_more([], P, C) ->
+ %% -1 isn't used in 'regexp' module, not sure what's different..
+ {match, P-1, C}.
+
+re_apply_or(A, B) ->
+ case re_apply_or_operand(A) of
+ nomatch ->
+ re_apply_or_operand(B);
+ X ->
+ X
+ end.
+
+re_apply_or_operand({apply_more, More, P, C}) ->
+ re_apply_more(More, P, C);
+re_apply_or_operand({apply, RE, More, Ch, P, C}) ->
+ re_apply(RE, More, Ch, P, C).
+
+
+% re_apply_or({match,P1}, {match,P2}) when P1 >= P2 -> {match,P1};
+% re_apply_or({match,P1}, {match,P2}) -> {match,P2};
+% re_apply_or(nomatch, R2) -> R2;
+% re_apply_or(R1, nomatch) -> R1.
+
+in_char_class(C, [{C1,C2}|Cc]) when C >= C1, C =< C2 -> true;
+in_char_class(C, [C|Cc]) -> true;
+in_char_class(C, [_|Cc]) -> in_char_class(C, Cc);
+in_char_class(C, []) -> false.
+
+push(Ch, Cont) ->
+ cord:walker_push(Ch, Cont).
+
+%% reverse(RE)
+%% Returns an equivalent regexp which takes its input in reverse order.
+%% RE should come from regexp:parse/1
+reverse({'or', A, B}) ->
+ {'or', reverse(B), reverse(A)};
+reverse({concat, A, B}) ->
+ {concat, reverse(B), reverse(A)};
+reverse({optional, RE}) ->
+ {optional, reverse(RE)};
+reverse({kclosure, RE}) ->
+ {kclosure, reverse(RE)};
+reverse({pclosure, RE}) ->
+ {pclosure, reverse(RE)};
+reverse(Other) ->
+ Other.
+
+%% Just trivial stuff for starters.
+%%
+%% Rearrange concat
+optimise({concat, {concat, A, B}, C}) ->
+ optimise({concat, A, {concat, B, C}});
+%% recurse
+optimise({'or', A, B}) ->
+ {'or', optimise(A), optimise(B)};
+optimise({kclosure, A}) ->
+ {kclosure, optimise(A)};
+optimise({pclosure, A}) ->
+ {pclosure, optimise(A)};
+optimise({optional, A}) ->
+ {optional, optimise(A)};
+optimise(X) ->
+ X.
+
+test_exprs() ->
+ ["foo",
+ "a|b",
+ "(foo)|(bar)",
+ "a+"].
+
+test_inputs() ->
+ ["foo", "a", "b", "c", "foo", "bar", "baz", "aaaaa"].
+
+test() ->
+ case [{Inp, RE} || RE <- test_exprs(), Inp <- test_inputs(),
+ not same(Inp, RE)] of
+ [] ->
+ ok;
+ %% BadOnes is where the match result was different. BUT!
+ %% sometimes that is OK. At time of writing, the code seems
+ %% correct but has some BadOnes.
+ BadOnes ->
+ lists:foreach(fun complain/1, BadOnes)
+ end.
+
+same(Inp, REStr) ->
+ forwards(Inp, REStr) == backwards(Inp, REStr).
+
+forwards(Inp, RE) ->
+ regexp:match(Inp, RE).
+
+backwards(Inp, REStr) ->
+ {ok, RE} = regexp:parse(REStr),
+ BackwardsResult = regexp:match(lists:reverse(Inp), reverse(RE)).
+
+complain({Inp, REStr}) ->
+ {ok, RE} = regexp:parse(REStr),
+ ForwardsResult = regexp:match(Inp, RE),
+ BackwardsResult = regexp:match(lists:reverse(Inp), reverse(RE)),
+ io:format("match(~p, ~p):~n Fwds = ~p~n Bwds = ~p~n",
+ [Inp, REStr, ForwardsResult, BackwardsResult]).
+
+%% Compare speed of this module on a cord to the speed of the 'regexp'
+%% module on a string.
+bench(Regexp, Cord) ->
+ List = cord:to_list(Cord),
+ {CordSpeed,_} = timer:tc(?MODULE, first_match, [Regexp, Cord]),
+ {ListSpeed, _} = timer:tc(regexp, first_match, [List, Regexp]),
+ io:format("Cord takes ~p% time of List (~pms vs ~pms).~n",
+ [round(CordSpeed * 100 / ListSpeed),
+ round(CordSpeed/1000),
+ round(ListSpeed/1000)]).
+
+escape([H|T]) ->
+ case lists:member(H, specials()) of
+ true ->
+ [$\\,H|escape(T)];
+ false ->
+ [H|escape(T)]
+ end;
+escape([]) ->
+ [].
+
+specials() -> "()^$[]*+?.\\|".
+