diff options
Diffstat (limited to 'src/cord.erl')
-rw-r--r-- | src/cord.erl | 504 |
1 files changed, 504 insertions, 0 deletions
diff --git a/src/cord.erl b/src/cord.erl new file mode 100644 index 0000000..980248d --- /dev/null +++ b/src/cord.erl @@ -0,0 +1,504 @@ +%%%---------------------------------------------------------------------- +%%% File : cord.erl +%%% Author : Luke Gorrie <luke@bluetail.com> +%%% Purpose : Data structure for large strings of text +%%% Created : 21 Oct 2000 by Luke Gorrie <luke@bluetail.com> +%%%---------------------------------------------------------------------- +%% +%% Cords - a scalable data structure for strings of text. +%% +%% Cords are binary trees with erlang binaries as leaves. The trees +%% are kept fairly balanced, and the sizes of the binary objects are +%% kept within acceptable bounds. By using binaries, it should only +%% cost about one byte per character. +%% +%% The main idea is to have a fast insert/replace operation which +%% doesn't change the binaries too much. By keeping the binaries on +%% the leaves fairly small, we generally get to move them about in one +%% piece while we're balancing things after large updates or pulling +%% out large regions. An added advantage of not disturbing the +%% binaries is that it's not too expensive to keep old copies of cords +%% around, since most of the binaries get shared on the heap. +%% +%% API: +%% new(): +%% Creates a new, empty cord. +%% new(ListOrBinary): +%% Creates a new cord from some input characters. +%% replace(Cord, NewText, Start, Length): +%% Replaces the `Length' long portion of `Cord' starting at `Start' +%% with `NewText' (a cord, binary, or iolist). +%% join(Left, Right) +%% Join two cords together into a new one. +%% to_binary(Cord): Convert a cord into a binary +%% to_list(Cord): Convert a cord into a list +%% walk(Cord, Pos, Direction, Fun): +%% "Walk" character by character along `Cord' in `Direction'. For each +%% character, we call Fun(Char) which returns either {result, R}, or +%% {more, NextFun}. If we run out of characters before the fun returns +%% a result, we call Fun(finish), which is required to return +%% {result, R}. + +-module(cord). +-author('luke@bluetail.com'). + +-compile(export_all). + +-export([new/0, new/1, cord_size/1, + replace/4, region/3, region_binary/3, region_list/3, + to_binary/1, to_list/1, to_iolist/1, + walk/4]). + +%% It seems that static values won't do for scaling to gigantic (tens +%% of megabytes) sizes. +-define(MIN_SIZE, 512). +-define(MAX_SIZE, 2048). + +%% A cord is either a #cord record or a binary. +-record(cord, {size, % Combined size + left, % cord() + right, % cord() + %% The dirty flag indicates that a cord (or one of its + %% children) has been changed since the last time the + %% cord was 'fixed' + dirty=false + }). + +-define(assert(X), + (case X of + true -> + true; + false -> + exit(lists:flatten(io_lib:format("Assertion failed at ~p:~p", + [?MODULE, ?LINE]))) + end)). + +new() -> <<>>. + +new(B) when binary(B) -> fix_cord(B); +new(L) when list(L) -> new(list_to_binary(L)). + +%% More efficient way to create a cord from a file. This +%% implementation is not terribly clever (read small chunks, balance +%% at the end), but at least it isn't a memory hog. +%% +%% Returns: {ok, cord()} | {error, Reason} +new_from_file(Filename) -> + case file:open(Filename, [raw, binary]) of + X = {error, Rsn} -> + X; + {ok, F} -> + read_chunks(F) + end. + +read_chunks(F) -> + read_chunks(F, new()). + +read_chunks(F, Acc) -> + case file:read(F, ?MAX_SIZE) of + eof -> + {ok, fix_cord(Acc)}; + {ok, Bin} -> + read_chunks(F, make_cord(Acc, Bin)); + X = {error, Reason} -> + X + end. + +%% Make a cord. +%% This is a "dirty" operation that doesn't rebalance the tree. +make_cord(Left, Right) -> + #cord{size=cord_size(Left) + cord_size(Right), + left=Left, + right=Right, + dirty=true}. + +cord_size(Cord) when binary(Cord) -> + size(Cord); +cord_size(Cord) when record(Cord, cord) -> + Cord#cord.size. + +max_depth(Cord) when binary(Cord) -> + 1; +max_depth(Cord) -> + 1 + max(max_depth(Cord#cord.left), + max_depth(Cord#cord.right)). + +nr_nodes(Cord) when binary(Cord) -> + 1; +nr_nodes(Cord) -> + 1 + nr_nodes(Cord#cord.left) + nr_nodes(Cord#cord.right). + +mean_leaf_size(Cord) -> + Sizes = leaf_sizes(Cord), + Sum = lists:foldr(fun(X, Acc) -> X + Acc end, + 0, + Sizes), + round(Sum / length(Sizes)). + +leaf_sizes(Cord) when binary(Cord) -> + [size(Cord)]; +leaf_sizes(Cord) -> + leaf_sizes(Cord#cord.left) ++ leaf_sizes(Cord#cord.right). + +max(X, Y) when X > Y -> X; +max(X, Y) -> Y. + +insert(Cord, New, Point) -> + replace(Cord, New, Point, 0). + +delete(Cord, Point, Length) -> + replace(Cord, [], Point, Length). + +%% replace/4: Replace a region of the cord. `New' is the text to +%% replace the region with, and can be either a list, binary, or cord. +replace(Cord, New, Start, Length) when list(New) -> + replace(Cord, list_to_binary(New), Start, Length); +replace(Cord, New, Start, Length) -> + %% Replace is done by copying the areas on the left and right of + %% the region, and joining them together with the new cord in the + %% middle. + {A, B} = split(Cord, Start-1), + {C, D} = split(B, Length), + fix_cord(make_cord(make_cord(A, New), D)). + +split(Cord, 0) when binary(Cord) -> + {<<>>, Cord}; +split(Cord, Pos) when binary(Cord) -> + ?assert(Pos =< cord_size(Cord)), + <<Left:Pos/binary, Right/binary>> = Cord, + {Left, Right}; + +split(Cord, Pos) when record(Cord, cord) -> + ?assert(Pos =< cord_size(Cord)), + LeftSz = cord_size(Cord#cord.left), + RightSz = cord_size(Cord#cord.right), + %%io:format("Split - left:~p right:~p~n", [LeftSz, RightSz]), + if LeftSz == Pos -> + {Cord#cord.left, Cord#cord.right}; + LeftSz > Pos -> + {SplitLeft, SplitRight} = split(Cord#cord.left, Pos), + {SplitLeft, make_cord(SplitRight, Cord#cord.right)}; + LeftSz < Pos -> + {SplitLeft, SplitRight} = split(Cord#cord.right, Pos-LeftSz), + {make_cord(Cord#cord.left, SplitLeft), SplitRight} + end. + +%% join two cords together and rebalance. +join(Left, Right) when binary(Left) -> + fix_cord(make_cord(Left, Right)). + +%% fix_cord/1 +%% +%% "Fix" a cord so that it's reasonably balanced, and it's leaves are +%% reasonable sizes. + +%% Leaf (binary) - break it up if it's too big +fix_cord(Bin) when binary(Bin) -> + if size(Bin) > ?MAX_SIZE -> + {Left, Right} = split(Bin, round(size(Bin) / 2)), + fix_cord(make_cord(Left, Right)); + true -> + Bin + end; +fix_cord(Cord) when Cord#cord.dirty == false -> + Cord; +%% Branch (cord) - merge its children if they're too small, balance it +%% if it's too unbalanced. +fix_cord(Cord) when record(Cord, cord) -> + Sz = cord_size(Cord), + Left = Cord#cord.left, + Right = Cord#cord.right, + LeftSz = cord_size(Left), + RightSz = cord_size(Right), + SzDiff = abs(LeftSz - RightSz), + if Sz < ?MIN_SIZE -> + %% Too small - make it into a binary + to_binary(Cord); + SzDiff > (Sz/3) -> + %% needs rebalancing + if LeftSz > RightSz -> + if binary(Left) -> + fix_cord(to_binary(Cord)); + true -> + balance_from_left(Cord) + end; + LeftSz =< RightSz -> + if binary(Right) -> + fix_cord(to_binary(Cord)); + true -> + balance_from_right(Cord) + end + end; + true -> + %% this cord is ok, fix the children + Cord#cord{left=fix_cord(Left), + right=fix_cord(Right), + dirty=false} + end. + +%% Balance by taking from the left side. +%% Left must be a #cord, right can be a binary. +balance_from_left(#cord{left=Left, right=Right}) when record(Left, cord) -> + LLSz = cord_size(Left#cord.left), + LRSz = cord_size(Left#cord.right), + if + LRSz < LLSz -> + %% single rotate + fix_cord(make_cord(Left#cord.left, + make_cord(Left#cord.right, + Right))); + record(Left#cord.right, cord) -> + %% double rotate + LeftRight = Left#cord.right, + fix_cord(make_cord(make_cord(Left#cord.left, + LeftRight#cord.left), + make_cord(LeftRight#cord.right, + Right))); + true -> + fix_cord(to_binary(make_cord(Left, Right))) + end. + +%% oh, pain, duplication. never have been good at taking redundancy +%% out of symmetric functions. -luke +balance_from_right(#cord{left=Left, right=Right}) when record(Right, cord) -> + RLSz = cord_size(Right#cord.left), + RRSz = cord_size(Right#cord.right), + if + RLSz < RRSz -> + %% single rotate + fix_cord(make_cord(make_cord(Left, + Right#cord.left), + Right#cord.right)); + record(Right#cord.left, cord) -> + %% double rotate + RightLeft = Right#cord.left, + fix_cord(make_cord(make_cord(Left, + RightLeft#cord.left), + make_cord(RightLeft#cord.right, + Right#cord.right))); + true -> + fix_cord(to_binary(make_cord(Left, Right))) + end. + +%% Return: cord() +region(Cord, Start, Length) -> + {A, B} = split(Cord, Start-1), + {C, D} = split(B, Length), + C. + +%% Return: binary() +region_binary(Cord, Start, Length) -> + to_binary(region(Cord, Start, Length)). + +%% Return: list() +region_list(Cord, Start, Length) -> + binary_to_list(region_binary(Cord, Start, Length)). + +to_binary(Cord) when binary(Cord) -> + Cord; +to_binary(Cord) -> + list_to_binary(to_binary1(Cord)). + +to_binary1(Cord) when binary(Cord) -> + Cord; +to_binary1(Cord) when record(Cord, cord) -> + [to_binary1(Cord#cord.left),to_binary1(Cord#cord.right)]. + +to_list(Cord) -> + binary_to_list(to_binary(Cord)). + +to_iolist(Cord) when binary(Cord) -> + [Cord]; +to_iolist(Cord) -> + [to_iolist(Cord#cord.left) | to_iolist(Cord#cord.right)]. + +%% Walk backwards along a cord, character by character. +%% F = fun(X) -> {more, F2} | {result, R} +%% X = char() | finish +walk(Cord, Pos, Direction, F) -> + %% Make this simple: extract the region we want to walk along. + Region = case Direction of + backward -> + {A, B} = split(Cord, Pos), + A; + forward -> + {A, B} = split(Cord, Pos-1), + B + end, + case walk1(Region, Direction, F) of + {result, R} -> + R; + {more, FNext} -> + {result, R} = FNext(finish), + R + end. + +walk1(<<>>, Direction, F) -> + {more, F}; +walk1(Bin, Direction, F) when binary(Bin) -> + {Chunk, Char} = case Direction of + backward -> + Sz = size(Bin) - 1, + <<Front:Sz/binary, Back>> = Bin, + {Front, Back}; + forward -> + <<Front, Back/binary>> = Bin, + {Back, Front} + end, + case F(Char) of + {more, F2} -> + walk1(Chunk, Direction, F2); + {result, R} -> + {result, R} + end; +walk1(Cord, Direction, F) when record(Cord, cord) -> + {First, Second} = case Direction of + backward -> + {Cord#cord.right, Cord#cord.left}; + forward -> + {Cord#cord.left, Cord#cord.right} + end, + case walk1(First, Direction, F) of + {more, F2} -> + walk1(Second, Direction, F2); + {result, R} -> + {result, R} + end. + +walker(Cord) -> + walker(Cord, forward). + +walker(Cord, Direction) -> + walker(Cord, Direction, <<>>). + +walker(Cord, Direction, More) when binary(Cord) -> + {Cord, Direction, More}; +walker(Cord, forward, More) -> + walker(Cord#cord.left, forward, make_cord(Cord#cord.right, More)); +walker(Cord, backward, More) -> + walker(Cord#cord.right, backward, make_cord(More, Cord#cord.left)). + +walker_at_end({walked_to_end, _}) -> + true; +walker_at_end({C,_,Rest}) -> + cord_size(C) + cord_size(Rest) == 0. + +walker_direction({walked_to_end, Direction}) -> Direction; +walker_direction({_, Direction, _}) -> Direction. + +walker_next({walked_to_end, Direction}) -> + {done, {walked_to_end, Direction}}; +walker_next({<<>>, Direction, More}) -> + case cord_size(More) of + 0 -> + {done, {walked_to_end, Direction}}; + _ -> + walker_next(walker(More, Direction)) + end; +walker_next({<<A, Chunk/binary>>, forward, More}) -> + {A, {Chunk, forward, More}}; +walker_next({Bin, backward, More}) -> + ChunkSz = size(Bin) - 1, + <<Chunk:ChunkSz/binary, A>> = Bin, + {A, {Chunk, backward, More}}. + +walker_push(done, Walker) -> + Walker; +walker_push(X, {walked_to_end, Dir}) -> + {<<X>>, Dir, <<>>}; +walker_push(X, {Bin, forward, More}) -> + {<<X>>, forward, make_cord(Bin, More)}; +walker_push(X, {Bin, backward, More}) -> + {<<X>>, backward, make_cord(Bin, More)}. + +walker_test() -> + Cord = make_cord(make_cord(<<1,2>>, <<3>>), + make_cord(<<4,5,6>>, <<7,8,9>>)), + W = walker(Cord, forward), + walker_test_loop(W). + +walker_test_loop(W) -> + case walker_next(W) of + {done, _} -> + []; + {Ch, W2} -> + [Ch|walker_test_loop(W2)] + end. + +test() -> + %% Test of binary cords + BinCord = <<1, 2, 3, 4, 5>> , + {BinCord, <<>>} = split(BinCord, size(BinCord)), + {<<>>, BinCord} = split(BinCord, 0), + {<<1, 2>>, <<3, 4, 5>>} = split(BinCord, 2), + %% Test of a simple cord + Cord1 = make_cord(<<1, 2>>, <<3, 4, 5>>), + {<<>>, <<1, 2, 3, 4, 5>>} = binsplit(Cord1, 0), + {<<1, 2>>, <<3, 4, 5>>} = binsplit(Cord1, 2), + {<<1, 2, 3>>, <<4, 5>>} = binsplit(Cord1, 3), + %% A less trivial cord + %% (spaces before commas are to workaround an erlang-mode indent problem) + A = <<1, 2>> , + B = <<3, 4>> , + C = <<5, 6>> , + D = <<7, 8>> , + E = <<9>> , + Cord2 = make_cord(make_cord(A, B), make_cord(C, make_cord(D, E))), + {<<>>, <<1, 2, 3, 4, 5, 6, 7, 8, 9>>} = binsplit(Cord2, 0), + {<<1, 2, 3, 4, 5>>, <<6, 7, 8, 9>>} = binsplit(Cord2, 5), + %% Joining + BinCord2 = to_binary(Cord2), + %% Why does = fail but == work? + true = <<BinCord/binary, BinCord2/binary>> == binjoin(BinCord, Cord2), + true = <<BinCord2/binary, BinCord/binary>> == binjoin(Cord2, Cord1), + true = <<BinCord2/binary, BinCord2/binary>> == binjoin(Cord2, Cord2), + %% Test "walking" + [1,2,3,4,5] = walk(Cord2, 5, backward, walk_test([])), + [9, 8, 7, 6, 5] = walk(Cord2, 5, forward, walk_test([])), + %% Test some operations + <<1, 2, 3, 4>> = delete(Cord2, 5, 5), + <<1, 2, 3, + 1, 2, 3, 4, 5, 6, 7, 8, 9, + 4, 5, 6, 7, 8, 9>> = cord:insert(Cord2, Cord2, 4), + ok. + +walk_test(Acc) -> + fun(finish) -> + {result, Acc}; + (X) -> + {more, walk_test([X|Acc])} + end. + +test2() -> + A = <<1, 2, 3, 4>> , + B = <<5, 6>> , + C = <<>> , + D = <<7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17>> , + join(A, join(B, join(C, D))). + +binsplit(Cord, Pos) -> + {Left, Right} = split(Cord, Pos), + {to_binary(Left), to_binary(Right)}. + +binjoin(Left, Right) -> + to_binary(make_cord(Left, Right)). + +benchmark(File) -> + {ok, Cord} = cord:new_from_file(File), + Sz = cord_size(Cord), + random:seed(), + Randoms = [random:uniform(Sz) || _ <- lists:seq(1, 100)], + timer:tc(?MODULE, split_with_each, [Cord, Randoms, 10]). + +split_with_each(Cord, L, N) -> + split_with_each(Cord, L, L, N). + +split_with_each(Cord, _, _, 0) -> + ok; +split_with_each(Cord, [H|T], L, N) -> + split(Cord, H), + split_with_each(Cord, T, L, N); +split_with_each(Cord, [], L, N) -> + split_with_each(Cord, L, L, N-1). + + |