aboutsummaryrefslogtreecommitdiffhomepage
path: root/src/cord.erl
diff options
context:
space:
mode:
Diffstat (limited to 'src/cord.erl')
-rw-r--r--src/cord.erl504
1 files changed, 504 insertions, 0 deletions
diff --git a/src/cord.erl b/src/cord.erl
new file mode 100644
index 0000000..980248d
--- /dev/null
+++ b/src/cord.erl
@@ -0,0 +1,504 @@
+%%%----------------------------------------------------------------------
+%%% File : cord.erl
+%%% Author : Luke Gorrie <luke@bluetail.com>
+%%% Purpose : Data structure for large strings of text
+%%% Created : 21 Oct 2000 by Luke Gorrie <luke@bluetail.com>
+%%%----------------------------------------------------------------------
+%%
+%% Cords - a scalable data structure for strings of text.
+%%
+%% Cords are binary trees with erlang binaries as leaves. The trees
+%% are kept fairly balanced, and the sizes of the binary objects are
+%% kept within acceptable bounds. By using binaries, it should only
+%% cost about one byte per character.
+%%
+%% The main idea is to have a fast insert/replace operation which
+%% doesn't change the binaries too much. By keeping the binaries on
+%% the leaves fairly small, we generally get to move them about in one
+%% piece while we're balancing things after large updates or pulling
+%% out large regions. An added advantage of not disturbing the
+%% binaries is that it's not too expensive to keep old copies of cords
+%% around, since most of the binaries get shared on the heap.
+%%
+%% API:
+%% new():
+%% Creates a new, empty cord.
+%% new(ListOrBinary):
+%% Creates a new cord from some input characters.
+%% replace(Cord, NewText, Start, Length):
+%% Replaces the `Length' long portion of `Cord' starting at `Start'
+%% with `NewText' (a cord, binary, or iolist).
+%% join(Left, Right)
+%% Join two cords together into a new one.
+%% to_binary(Cord): Convert a cord into a binary
+%% to_list(Cord): Convert a cord into a list
+%% walk(Cord, Pos, Direction, Fun):
+%% "Walk" character by character along `Cord' in `Direction'. For each
+%% character, we call Fun(Char) which returns either {result, R}, or
+%% {more, NextFun}. If we run out of characters before the fun returns
+%% a result, we call Fun(finish), which is required to return
+%% {result, R}.
+
+-module(cord).
+-author('luke@bluetail.com').
+
+-compile(export_all).
+
+-export([new/0, new/1, cord_size/1,
+ replace/4, region/3, region_binary/3, region_list/3,
+ to_binary/1, to_list/1, to_iolist/1,
+ walk/4]).
+
+%% It seems that static values won't do for scaling to gigantic (tens
+%% of megabytes) sizes.
+-define(MIN_SIZE, 512).
+-define(MAX_SIZE, 2048).
+
+%% A cord is either a #cord record or a binary.
+-record(cord, {size, % Combined size
+ left, % cord()
+ right, % cord()
+ %% The dirty flag indicates that a cord (or one of its
+ %% children) has been changed since the last time the
+ %% cord was 'fixed'
+ dirty=false
+ }).
+
+-define(assert(X),
+ (case X of
+ true ->
+ true;
+ false ->
+ exit(lists:flatten(io_lib:format("Assertion failed at ~p:~p",
+ [?MODULE, ?LINE])))
+ end)).
+
+new() -> <<>>.
+
+new(B) when binary(B) -> fix_cord(B);
+new(L) when list(L) -> new(list_to_binary(L)).
+
+%% More efficient way to create a cord from a file. This
+%% implementation is not terribly clever (read small chunks, balance
+%% at the end), but at least it isn't a memory hog.
+%%
+%% Returns: {ok, cord()} | {error, Reason}
+new_from_file(Filename) ->
+ case file:open(Filename, [raw, binary]) of
+ X = {error, Rsn} ->
+ X;
+ {ok, F} ->
+ read_chunks(F)
+ end.
+
+read_chunks(F) ->
+ read_chunks(F, new()).
+
+read_chunks(F, Acc) ->
+ case file:read(F, ?MAX_SIZE) of
+ eof ->
+ {ok, fix_cord(Acc)};
+ {ok, Bin} ->
+ read_chunks(F, make_cord(Acc, Bin));
+ X = {error, Reason} ->
+ X
+ end.
+
+%% Make a cord.
+%% This is a "dirty" operation that doesn't rebalance the tree.
+make_cord(Left, Right) ->
+ #cord{size=cord_size(Left) + cord_size(Right),
+ left=Left,
+ right=Right,
+ dirty=true}.
+
+cord_size(Cord) when binary(Cord) ->
+ size(Cord);
+cord_size(Cord) when record(Cord, cord) ->
+ Cord#cord.size.
+
+max_depth(Cord) when binary(Cord) ->
+ 1;
+max_depth(Cord) ->
+ 1 + max(max_depth(Cord#cord.left),
+ max_depth(Cord#cord.right)).
+
+nr_nodes(Cord) when binary(Cord) ->
+ 1;
+nr_nodes(Cord) ->
+ 1 + nr_nodes(Cord#cord.left) + nr_nodes(Cord#cord.right).
+
+mean_leaf_size(Cord) ->
+ Sizes = leaf_sizes(Cord),
+ Sum = lists:foldr(fun(X, Acc) -> X + Acc end,
+ 0,
+ Sizes),
+ round(Sum / length(Sizes)).
+
+leaf_sizes(Cord) when binary(Cord) ->
+ [size(Cord)];
+leaf_sizes(Cord) ->
+ leaf_sizes(Cord#cord.left) ++ leaf_sizes(Cord#cord.right).
+
+max(X, Y) when X > Y -> X;
+max(X, Y) -> Y.
+
+insert(Cord, New, Point) ->
+ replace(Cord, New, Point, 0).
+
+delete(Cord, Point, Length) ->
+ replace(Cord, [], Point, Length).
+
+%% replace/4: Replace a region of the cord. `New' is the text to
+%% replace the region with, and can be either a list, binary, or cord.
+replace(Cord, New, Start, Length) when list(New) ->
+ replace(Cord, list_to_binary(New), Start, Length);
+replace(Cord, New, Start, Length) ->
+ %% Replace is done by copying the areas on the left and right of
+ %% the region, and joining them together with the new cord in the
+ %% middle.
+ {A, B} = split(Cord, Start-1),
+ {C, D} = split(B, Length),
+ fix_cord(make_cord(make_cord(A, New), D)).
+
+split(Cord, 0) when binary(Cord) ->
+ {<<>>, Cord};
+split(Cord, Pos) when binary(Cord) ->
+ ?assert(Pos =< cord_size(Cord)),
+ <<Left:Pos/binary, Right/binary>> = Cord,
+ {Left, Right};
+
+split(Cord, Pos) when record(Cord, cord) ->
+ ?assert(Pos =< cord_size(Cord)),
+ LeftSz = cord_size(Cord#cord.left),
+ RightSz = cord_size(Cord#cord.right),
+ %%io:format("Split - left:~p right:~p~n", [LeftSz, RightSz]),
+ if LeftSz == Pos ->
+ {Cord#cord.left, Cord#cord.right};
+ LeftSz > Pos ->
+ {SplitLeft, SplitRight} = split(Cord#cord.left, Pos),
+ {SplitLeft, make_cord(SplitRight, Cord#cord.right)};
+ LeftSz < Pos ->
+ {SplitLeft, SplitRight} = split(Cord#cord.right, Pos-LeftSz),
+ {make_cord(Cord#cord.left, SplitLeft), SplitRight}
+ end.
+
+%% join two cords together and rebalance.
+join(Left, Right) when binary(Left) ->
+ fix_cord(make_cord(Left, Right)).
+
+%% fix_cord/1
+%%
+%% "Fix" a cord so that it's reasonably balanced, and it's leaves are
+%% reasonable sizes.
+
+%% Leaf (binary) - break it up if it's too big
+fix_cord(Bin) when binary(Bin) ->
+ if size(Bin) > ?MAX_SIZE ->
+ {Left, Right} = split(Bin, round(size(Bin) / 2)),
+ fix_cord(make_cord(Left, Right));
+ true ->
+ Bin
+ end;
+fix_cord(Cord) when Cord#cord.dirty == false ->
+ Cord;
+%% Branch (cord) - merge its children if they're too small, balance it
+%% if it's too unbalanced.
+fix_cord(Cord) when record(Cord, cord) ->
+ Sz = cord_size(Cord),
+ Left = Cord#cord.left,
+ Right = Cord#cord.right,
+ LeftSz = cord_size(Left),
+ RightSz = cord_size(Right),
+ SzDiff = abs(LeftSz - RightSz),
+ if Sz < ?MIN_SIZE ->
+ %% Too small - make it into a binary
+ to_binary(Cord);
+ SzDiff > (Sz/3) ->
+ %% needs rebalancing
+ if LeftSz > RightSz ->
+ if binary(Left) ->
+ fix_cord(to_binary(Cord));
+ true ->
+ balance_from_left(Cord)
+ end;
+ LeftSz =< RightSz ->
+ if binary(Right) ->
+ fix_cord(to_binary(Cord));
+ true ->
+ balance_from_right(Cord)
+ end
+ end;
+ true ->
+ %% this cord is ok, fix the children
+ Cord#cord{left=fix_cord(Left),
+ right=fix_cord(Right),
+ dirty=false}
+ end.
+
+%% Balance by taking from the left side.
+%% Left must be a #cord, right can be a binary.
+balance_from_left(#cord{left=Left, right=Right}) when record(Left, cord) ->
+ LLSz = cord_size(Left#cord.left),
+ LRSz = cord_size(Left#cord.right),
+ if
+ LRSz < LLSz ->
+ %% single rotate
+ fix_cord(make_cord(Left#cord.left,
+ make_cord(Left#cord.right,
+ Right)));
+ record(Left#cord.right, cord) ->
+ %% double rotate
+ LeftRight = Left#cord.right,
+ fix_cord(make_cord(make_cord(Left#cord.left,
+ LeftRight#cord.left),
+ make_cord(LeftRight#cord.right,
+ Right)));
+ true ->
+ fix_cord(to_binary(make_cord(Left, Right)))
+ end.
+
+%% oh, pain, duplication. never have been good at taking redundancy
+%% out of symmetric functions. -luke
+balance_from_right(#cord{left=Left, right=Right}) when record(Right, cord) ->
+ RLSz = cord_size(Right#cord.left),
+ RRSz = cord_size(Right#cord.right),
+ if
+ RLSz < RRSz ->
+ %% single rotate
+ fix_cord(make_cord(make_cord(Left,
+ Right#cord.left),
+ Right#cord.right));
+ record(Right#cord.left, cord) ->
+ %% double rotate
+ RightLeft = Right#cord.left,
+ fix_cord(make_cord(make_cord(Left,
+ RightLeft#cord.left),
+ make_cord(RightLeft#cord.right,
+ Right#cord.right)));
+ true ->
+ fix_cord(to_binary(make_cord(Left, Right)))
+ end.
+
+%% Return: cord()
+region(Cord, Start, Length) ->
+ {A, B} = split(Cord, Start-1),
+ {C, D} = split(B, Length),
+ C.
+
+%% Return: binary()
+region_binary(Cord, Start, Length) ->
+ to_binary(region(Cord, Start, Length)).
+
+%% Return: list()
+region_list(Cord, Start, Length) ->
+ binary_to_list(region_binary(Cord, Start, Length)).
+
+to_binary(Cord) when binary(Cord) ->
+ Cord;
+to_binary(Cord) ->
+ list_to_binary(to_binary1(Cord)).
+
+to_binary1(Cord) when binary(Cord) ->
+ Cord;
+to_binary1(Cord) when record(Cord, cord) ->
+ [to_binary1(Cord#cord.left),to_binary1(Cord#cord.right)].
+
+to_list(Cord) ->
+ binary_to_list(to_binary(Cord)).
+
+to_iolist(Cord) when binary(Cord) ->
+ [Cord];
+to_iolist(Cord) ->
+ [to_iolist(Cord#cord.left) | to_iolist(Cord#cord.right)].
+
+%% Walk backwards along a cord, character by character.
+%% F = fun(X) -> {more, F2} | {result, R}
+%% X = char() | finish
+walk(Cord, Pos, Direction, F) ->
+ %% Make this simple: extract the region we want to walk along.
+ Region = case Direction of
+ backward ->
+ {A, B} = split(Cord, Pos),
+ A;
+ forward ->
+ {A, B} = split(Cord, Pos-1),
+ B
+ end,
+ case walk1(Region, Direction, F) of
+ {result, R} ->
+ R;
+ {more, FNext} ->
+ {result, R} = FNext(finish),
+ R
+ end.
+
+walk1(<<>>, Direction, F) ->
+ {more, F};
+walk1(Bin, Direction, F) when binary(Bin) ->
+ {Chunk, Char} = case Direction of
+ backward ->
+ Sz = size(Bin) - 1,
+ <<Front:Sz/binary, Back>> = Bin,
+ {Front, Back};
+ forward ->
+ <<Front, Back/binary>> = Bin,
+ {Back, Front}
+ end,
+ case F(Char) of
+ {more, F2} ->
+ walk1(Chunk, Direction, F2);
+ {result, R} ->
+ {result, R}
+ end;
+walk1(Cord, Direction, F) when record(Cord, cord) ->
+ {First, Second} = case Direction of
+ backward ->
+ {Cord#cord.right, Cord#cord.left};
+ forward ->
+ {Cord#cord.left, Cord#cord.right}
+ end,
+ case walk1(First, Direction, F) of
+ {more, F2} ->
+ walk1(Second, Direction, F2);
+ {result, R} ->
+ {result, R}
+ end.
+
+walker(Cord) ->
+ walker(Cord, forward).
+
+walker(Cord, Direction) ->
+ walker(Cord, Direction, <<>>).
+
+walker(Cord, Direction, More) when binary(Cord) ->
+ {Cord, Direction, More};
+walker(Cord, forward, More) ->
+ walker(Cord#cord.left, forward, make_cord(Cord#cord.right, More));
+walker(Cord, backward, More) ->
+ walker(Cord#cord.right, backward, make_cord(More, Cord#cord.left)).
+
+walker_at_end({walked_to_end, _}) ->
+ true;
+walker_at_end({C,_,Rest}) ->
+ cord_size(C) + cord_size(Rest) == 0.
+
+walker_direction({walked_to_end, Direction}) -> Direction;
+walker_direction({_, Direction, _}) -> Direction.
+
+walker_next({walked_to_end, Direction}) ->
+ {done, {walked_to_end, Direction}};
+walker_next({<<>>, Direction, More}) ->
+ case cord_size(More) of
+ 0 ->
+ {done, {walked_to_end, Direction}};
+ _ ->
+ walker_next(walker(More, Direction))
+ end;
+walker_next({<<A, Chunk/binary>>, forward, More}) ->
+ {A, {Chunk, forward, More}};
+walker_next({Bin, backward, More}) ->
+ ChunkSz = size(Bin) - 1,
+ <<Chunk:ChunkSz/binary, A>> = Bin,
+ {A, {Chunk, backward, More}}.
+
+walker_push(done, Walker) ->
+ Walker;
+walker_push(X, {walked_to_end, Dir}) ->
+ {<<X>>, Dir, <<>>};
+walker_push(X, {Bin, forward, More}) ->
+ {<<X>>, forward, make_cord(Bin, More)};
+walker_push(X, {Bin, backward, More}) ->
+ {<<X>>, backward, make_cord(Bin, More)}.
+
+walker_test() ->
+ Cord = make_cord(make_cord(<<1,2>>, <<3>>),
+ make_cord(<<4,5,6>>, <<7,8,9>>)),
+ W = walker(Cord, forward),
+ walker_test_loop(W).
+
+walker_test_loop(W) ->
+ case walker_next(W) of
+ {done, _} ->
+ [];
+ {Ch, W2} ->
+ [Ch|walker_test_loop(W2)]
+ end.
+
+test() ->
+ %% Test of binary cords
+ BinCord = <<1, 2, 3, 4, 5>> ,
+ {BinCord, <<>>} = split(BinCord, size(BinCord)),
+ {<<>>, BinCord} = split(BinCord, 0),
+ {<<1, 2>>, <<3, 4, 5>>} = split(BinCord, 2),
+ %% Test of a simple cord
+ Cord1 = make_cord(<<1, 2>>, <<3, 4, 5>>),
+ {<<>>, <<1, 2, 3, 4, 5>>} = binsplit(Cord1, 0),
+ {<<1, 2>>, <<3, 4, 5>>} = binsplit(Cord1, 2),
+ {<<1, 2, 3>>, <<4, 5>>} = binsplit(Cord1, 3),
+ %% A less trivial cord
+ %% (spaces before commas are to workaround an erlang-mode indent problem)
+ A = <<1, 2>> ,
+ B = <<3, 4>> ,
+ C = <<5, 6>> ,
+ D = <<7, 8>> ,
+ E = <<9>> ,
+ Cord2 = make_cord(make_cord(A, B), make_cord(C, make_cord(D, E))),
+ {<<>>, <<1, 2, 3, 4, 5, 6, 7, 8, 9>>} = binsplit(Cord2, 0),
+ {<<1, 2, 3, 4, 5>>, <<6, 7, 8, 9>>} = binsplit(Cord2, 5),
+ %% Joining
+ BinCord2 = to_binary(Cord2),
+ %% Why does = fail but == work?
+ true = <<BinCord/binary, BinCord2/binary>> == binjoin(BinCord, Cord2),
+ true = <<BinCord2/binary, BinCord/binary>> == binjoin(Cord2, Cord1),
+ true = <<BinCord2/binary, BinCord2/binary>> == binjoin(Cord2, Cord2),
+ %% Test "walking"
+ [1,2,3,4,5] = walk(Cord2, 5, backward, walk_test([])),
+ [9, 8, 7, 6, 5] = walk(Cord2, 5, forward, walk_test([])),
+ %% Test some operations
+ <<1, 2, 3, 4>> = delete(Cord2, 5, 5),
+ <<1, 2, 3,
+ 1, 2, 3, 4, 5, 6, 7, 8, 9,
+ 4, 5, 6, 7, 8, 9>> = cord:insert(Cord2, Cord2, 4),
+ ok.
+
+walk_test(Acc) ->
+ fun(finish) ->
+ {result, Acc};
+ (X) ->
+ {more, walk_test([X|Acc])}
+ end.
+
+test2() ->
+ A = <<1, 2, 3, 4>> ,
+ B = <<5, 6>> ,
+ C = <<>> ,
+ D = <<7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17>> ,
+ join(A, join(B, join(C, D))).
+
+binsplit(Cord, Pos) ->
+ {Left, Right} = split(Cord, Pos),
+ {to_binary(Left), to_binary(Right)}.
+
+binjoin(Left, Right) ->
+ to_binary(make_cord(Left, Right)).
+
+benchmark(File) ->
+ {ok, Cord} = cord:new_from_file(File),
+ Sz = cord_size(Cord),
+ random:seed(),
+ Randoms = [random:uniform(Sz) || _ <- lists:seq(1, 100)],
+ timer:tc(?MODULE, split_with_each, [Cord, Randoms, 10]).
+
+split_with_each(Cord, L, N) ->
+ split_with_each(Cord, L, L, N).
+
+split_with_each(Cord, _, _, 0) ->
+ ok;
+split_with_each(Cord, [H|T], L, N) ->
+ split(Cord, H),
+ split_with_each(Cord, T, L, N);
+split_with_each(Cord, [], L, N) ->
+ split_with_each(Cord, L, L, N-1).
+
+