Switch to using skip lists from leveled_tree

Remove now unused leveled_skiplist and leveled_tinybloom
2017-01-21 22:34:56 +00:00 · 2017-01-21 22:34:56 +00:00 · 58cda7d157
commit 58cda7d157
parent 6d2eb1d57c
4 changed files with 34 additions and 835 deletions
--- a/include/leveled.hrl
+++ b/include/leveled.hrl
@ -15,7 +15,7 @@
 %% Inker key type used for tombstones
 -define(INKT_TOMB, tomb).
-define(CACHE_TYPE, idxt).
+-define(CACHE_TYPE, skpl).
 -record(sft_options,
                        {wait = true :: boolean(),
--- a/src/leveled_skiplist.erl
+++ b/src/leveled_skiplist.erl
@ -1,661 +0,0 @@
 %% -------- SKIPLIST ---------
 %%
 %% For storing small numbers of {K, V} pairs with reasonable insertion and
 %% fetch times, and with fast support for flattening to a list or a sublist
 %% within a certain key range
 %%
 %% Used instead of gb_trees to retain compatibility with OTP16 (and Riak's
 %% ongoing dependency on OTP16)
 %%
 %% Not a proper skip list.  Only supports a fixed depth.  Good enough for the
 %% purposes of leveled.  Also uses the peculiar endkey_passed function within
 %% leveled.  Not tested beyond a depth of 2.
 -module(leveled_skiplist).
 -include("include/leveled.hrl").
 -export([
        from_list/1,
        from_list/2,
        from_sortedlist/1,
        from_sortedlist/2,
        from_orderedset/1,
        from_orderedset/2,
        to_list/1,
        enter/3,
        enter/4,
        enter_nolookup/3,
        to_range/2,
        to_range/3,
        lookup/2,
        lookup/3,
        empty/0,
        empty/1,
        size/1
        ]).      
 -include_lib("eunit/include/eunit.hrl").
 -define(SKIP_WIDTH, 16).
 -define(LIST_HEIGHT, 2).
 -define(INFINITY_KEY, {null, null, null, null, null}).
 -define(BITARRAY_SIZE, 2048).
 %%%============================================================================
 %%% SkipList API
 %%%============================================================================
 %% Add a Key/Value pair to the skiplist, deriving the bloom hash from the key
 enter(Key, Value, SkipList) ->
    enter(Key, leveled_codec:magic_hash(Key), Value, SkipList).
 %% Add a Key/Value pair when the magic hash of the key is already known.
 %% The first element of the skiplist tuple is either the atom list_only (no
 %% bloom to maintain) or a bloom which has the hash added to it.  Marker
 %% selection within the list uses the native erlang:phash2/1 hash.
 enter(Key, Hash, Value, SkipList) ->
    Filter = element(1, SkipList),
    UpdFilter =
        if
            Filter =:= list_only ->
                list_only;
            true ->
                leveled_tinybloom:enter({hash, Hash}, Filter)
        end,
    UpdList =
        enter(Key, Value, erlang:phash2(Key),
                element(2, SkipList),
                ?SKIP_WIDTH, ?LIST_HEIGHT),
    {UpdFilter, UpdList}.
 %% A key entered this way can be iterated over, but the bloom (if any) is
 %% left untouched, so the key is not intended to be looked up - used for
 %% index terms.
 %% The key may still be a marker key - the much cheaper native hash is used
 %% to determine this, avoiding the more expensive magic hash
 enter_nolookup(Key, Value, SkipList) ->
    UpdList =
        enter(Key, Value, erlang:phash2(Key),
                element(2, SkipList),
                ?SKIP_WIDTH, ?LIST_HEIGHT),
    {element(1, SkipList), UpdList}.
 %% Build a skiplist (without bloom protection) from the contents of an ETS
 %% table
 from_orderedset(Table) ->
    from_sortedlist(ets:tab2list(Table), false).
 %% Build a skiplist from the contents of an ETS table; Bloom is a boolean
 %% selecting bloom protection.  The table contents are passed on as an
 %% already-sorted list - presumably the table is an ordered_set.
 from_orderedset(Table, Bloom) ->
    SortedKVL = ets:tab2list(Table),
    from_sortedlist(SortedKVL, Bloom).
 %% Build a skiplist (without bloom protection) from an unsorted {K, V} list
 from_list(UnsortedKVL) ->
    from_sortedlist(lists:ukeysort(1, UnsortedKVL), false).
 %% Build a skiplist from an unsorted {K, V} list.  The list is sorted by key
 %% and de-duplicated (lists:ukeysort/2) before the skiplist is generated.
 from_list(UnsortedKVL, BloomProtect) ->
    from_sortedlist(lists:ukeysort(1, UnsortedKVL), BloomProtect).
 %% Build a skiplist (without bloom protection) from a key-sorted {K, V} list
 from_sortedlist(SortedKVL) ->
    from_sortedlist(SortedKVL, false).
 %% Build a skiplist from a key-sorted {K, V} list.  When BloomProtect is true
 %% every key is added to a fresh bloom; otherwise the atom list_only takes
 %% the place of the bloom.
 from_sortedlist([], BloomProtect) ->
    empty(BloomProtect);
 from_sortedlist(SortedKVL, true) ->
    AddKeyFun = fun({K, _V}, Acc) -> leveled_tinybloom:enter(K, Acc) end,
    Bloom =
        lists:foldr(AddKeyFun,
                    leveled_tinybloom:empty(?SKIP_WIDTH),
                    SortedKVL),
    {Bloom, from_list(SortedKVL, ?SKIP_WIDTH, ?LIST_HEIGHT)};
 from_sortedlist(SortedKVL, false) ->
    {list_only, from_list(SortedKVL, ?SKIP_WIDTH, ?LIST_HEIGHT)}.
 %% Fetch the value for Key - returns {value, V} or none.  The magic hash is
 %% only calculated when there is a bloom to check it against.
 lookup(Key, SkipList) when element(1, SkipList) =:= list_only ->
    list_lookup(Key, element(2, SkipList), ?LIST_HEIGHT);
 lookup(Key, SkipList) ->
    lookup(Key, leveled_codec:magic_hash(Key), SkipList).
 %% Fetch the value for Key where the magic hash of the key is already known.
 %% The list itself is only searched if there is no bloom, or the bloom
 %% reports that the hash may be present.
 lookup(Key, Hash, SkipList) ->
    Filter = element(1, SkipList),
    MaybePresent =
        Filter =:= list_only
            orelse leveled_tinybloom:check({hash, Hash}, Filter),
    case MaybePresent of
        true ->
            list_lookup(Key, element(2, SkipList), ?LIST_HEIGHT);
        false ->
            none
    end.
 %% Rather than support iterator_from like gb_trees, this just outputs a
 %% key-sorted list for the desired range, which can then be iterated over as
 %% normal.  With no end key given the range runs to the infinity key.
 to_range(SkipList, Start) ->
    to_range(SkipList, Start, ?INFINITY_KEY).
 to_range(SkipList, Start, End) ->
    Levels = element(2, SkipList),
    to_range(Levels, Start, End, ?LIST_HEIGHT).
 %% Flatten the skiplist (ignoring any bloom) into a single {K, V} list
 to_list(SkipList) ->
    Levels = element(2, SkipList),
    to_list(Levels, ?LIST_HEIGHT).
 %% An empty skiplist without bloom protection
 empty() ->
    empty(false).
 %% An empty skiplist - with an empty bloom when BloomProtect is true, or the
 %% atom list_only in place of the bloom when it is false
 empty(true) ->
    {leveled_tinybloom:empty(?SKIP_WIDTH), empty([], ?LIST_HEIGHT)};
 empty(false) ->
    {list_only, empty([], ?LIST_HEIGHT)}.
 %% Count of {K, V} pairs held in the skiplist (the bloom is ignored)
 size(SkipList) ->
    Levels = element(2, SkipList),
    size(Levels, ?LIST_HEIGHT).
 %%%============================================================================
 %%% SkipList Base Functions
 %%%============================================================================
 %% Insert Key/Value into the skiplist structure at the given level.
 %% At level 1 the SkipList is a list of {MarkerKey, [{K, V}]} pairs; at
 %% higher levels it is a list of {MarkerKey, SubSkipList} pairs.  A key is
 %% promoted to be a marker at a level when Hash is an exact multiple of the
 %% divisor for that level (Width at level 1, width(Level, Width) above).
 %% NOTE(review): find_mark/2 returns false if no marker is >= Key, which
 %% would fail the tuple match below - this appears to rely on the infinity
 %% key marker present in lists started from empty/2; confirm for lists built
 %% via from_list.
 enter(Key, Value, Hash, SkipList, Width, 1) ->
    % First marker at or above the key, and the sublist which it covers
    {MarkerKey, SubList} = find_mark(Key, SkipList),
    case Hash rem Width of
        0 ->
            % The key is a new marker - split the sublist so that everything
            % up to and including the key sits under the new marker, and the
            % remainder stays under the existing marker
            {LHS, RHS} = lists:splitwith(fun({K, _V}) ->
                                                K =< Key end,
                                            SubList),
            SkpL1 = lists:keyreplace(MarkerKey, 1, SkipList, {MarkerKey, RHS}),
            % The new pair is placed first so that ukeysort retains it ahead
            % of any existing entry for the same key
            SkpL2 = [{Key, lists:ukeysort(1, [{Key, Value}|LHS])}|SkpL1],
            lists:ukeysort(1, SkpL2);
        _ ->
            % Not a marker - insert (or replace) in key order within the
            % sublist of the existing marker
            {LHS, RHS} = lists:splitwith(fun({K, _V}) -> K < Key end, SubList),
            UpdSubList =
                case RHS of
                    [] ->
                        LHS ++ [{Key, Value}];
                    [{FirstKey, _V}|RHSTail] ->
                        case FirstKey of
                            Key ->
                                % Key already present - replace the value
                                LHS ++ [{Key, Value}] ++ RHSTail;
                            _ ->
                                LHS ++ [{Key, Value}] ++ RHS
                        end
                end,        
            lists:keyreplace(MarkerKey, 1, SkipList, {MarkerKey, UpdSubList})
    end;
 enter(Key, Value, Hash, SkipList, Width, Level) ->
    HashMatch = width(Level, Width),
    {MarkerKey, SubSkipList} = find_mark(Key, SkipList),
    % Always enter into the level below first, then decide if the key should
    % also be a marker at this level
    UpdSubSkipList = enter(Key, Value, Hash, SubSkipList, Width, Level - 1),
    case Hash rem HashMatch of
        0 ->
            % The key is a new marker at this level - split the updated
            % sub-skiplist between the new marker and the existing marker
            {LHS, RHS} = lists:splitwith(fun({K, _V}) ->
                                                K =< Key end,
                                            UpdSubSkipList),
            SkpL1 = lists:keyreplace(MarkerKey, 1, SkipList, {MarkerKey, RHS}),
            lists:ukeysort(1, [{Key, LHS}|SkpL1]);
        _ ->
            % Need to replace Marker Key with sublist
            lists:keyreplace(MarkerKey,
                                1,
                                SkipList,
                                {MarkerKey, UpdSubSkipList})
    end.
 %% Build the skiplist structure from a key-sorted list, one level at a time.
 %% Each pass wraps the input list into {LastKey, Chunk} entries (chunks of at
 %% most SkipWidth elements), and the result is the input to the next level
 %% up.  ListHeight counts the levels still to be built.
 from_list(SkipList, _SkipWidth, 0) ->
    SkipList;
 from_list(KVList, SkipWidth, ListHeight) ->
    L0 = length(KVList),
    SL0 =
        case L0 > SkipWidth of
            true ->
                from_list(KVList, L0, [], SkipWidth);
            false ->
                % Everything fits under one marker - the last key in the list
                {LastK, _LastSL} = lists:last(KVList),
                [{LastK, KVList}]
        end,
    from_list(SL0, SkipWidth, ListHeight - 1).
 %% Chunk a list of known length L into {LastKey, Chunk} entries.  The
 %% accumulator is built in reverse by prepending (rather than appending with
 %% ++ on every chunk) and reversed once when the input is exhausted.
 from_list([], 0, RevSkipList, _SkipWidth) ->
    lists:reverse(RevSkipList);
 from_list(KVList, L, RevSkipList, SkipWidth) ->
    SubLL = min(SkipWidth, L),
    {Head, Tail} = lists:split(SubLL, KVList),
    {LastK, _LastV} = lists:last(Head),
    from_list(Tail, L - SubLL, [{LastK, Head}|RevSkipList], SkipWidth).
 %% Look up Key by descending through Level levels of markers.
 %% Returns {value, V} if the key is present, otherwise none.
 %% get_sublist/2 returns null when no marker at this level is >= Key; that
 %% is treated as a miss at every level (including level 1, where passing
 %% null on to lists:keyfind/3 would otherwise raise badarg).
 list_lookup(Key, SkipList, Level) ->
    case get_sublist(Key, SkipList) of
        null ->
            none;
        SubList when Level =:= 1 ->
            % Bottom level - the sublist is the list of {K, V} pairs
            case lists:keyfind(Key, 1, SubList) of
                false ->
                    none;
                {Key, V} ->
                    {value, V}
            end;
        SubList ->
            list_lookup(Key, SubList, Level - 1)
    end.
 %% Flatten Level levels of {Marker, SubList} entries into a single list of
 %% {K, V} pairs, preserving order.  Uses lists:flatmap/2 so that the output
 %% is appended once, rather than re-copying the accumulator with ++ for
 %% every marker.
 to_list(SkipList, 1) ->
    lists:flatmap(fun({_Mark, SL}) -> SL end, SkipList);
 to_list(SkipList, Level) ->
    lists:flatmap(fun({_Mark, SL}) -> to_list(SL, Level - 1) end, SkipList).
 %% Return the {K, V} pairs between StartKey and EndKey as a key-sorted list.
 %% The start key is included; whether the end key is passed is decided by
 %% leveled_codec:endkey_passed/2.
 to_range(SkipList, StartKey, EndKey, ListHeight) ->
    to_range(SkipList, StartKey, EndKey, ListHeight, [], true).
 %% Accumulate the range one bottom-level sublist at a time.  After each
 %% sublist is consumed the search restarts from the last key of that sublist
 %% (exclusive, as StartIncl is then false), until either no further sublist
 %% is found or the end key has been passed.
 to_range(SkipList, StartKey, EndKey, ListHeight, Acc, StartIncl) ->
    SL = sublist_above(SkipList, StartKey, ListHeight, StartIncl),
    case SL of
        [] ->
            Acc;
        _ ->
            {LK, _LV} = lists:last(SL),
            case leveled_codec:endkey_passed(EndKey, LK) of
                false ->
                    % Whole sublist is within range - take it all and continue
                    % from its last key
                    to_range(SkipList,
                                LK,
                                EndKey,
                                ListHeight,
                                Acc ++ SL,
                                false);
                true ->
                    % The range ends within this sublist - take only the keys
                    % up to the point where the end key is passed
                    SplitFun =
                        fun({K, _V}) ->
                                not leveled_codec:endkey_passed(EndKey, K) end,
                    LHS = lists:takewhile(SplitFun, SL),
                    Acc ++ LHS
            end
    end.
 %% Find the remainder of the bottom-level sublist which starts at StartKey
 %% (when StartIncl is true) or immediately after StartKey (when false).
 %% At each level the entries below the start are dropped; above level 0 the
 %% search then descends into the first remaining marker's sublist.  Returns
 %% [] if nothing at this level is at or beyond the start.
 sublist_above(SkipList, StartKey, Level, StartIncl) ->
    BelowStart =
        fun({K, _}) when StartIncl =:= true -> K < StartKey;
           ({K, _}) when StartIncl =:= false -> K =< StartKey
        end,
    Remainder = lists:dropwhile(BelowStart, SkipList),
    case {Level, Remainder} of
        {0, _} ->
            Remainder;
        {_, []} ->
            [];
        {_, [{_K, SL}|_Rest]} ->
            sublist_above(SL, StartKey, Level - 1, StartIncl)
    end.
 %% Wrap SkipList under an infinity-key marker once for each of Level levels,
 %% so that every level has a marker at or above any key to be entered
 empty(SkipList, Level) ->
    Wrapped = [{?INFINITY_KEY, SkipList}],
    case Level of
        1 ->
            Wrapped;
        _ ->
            empty(Wrapped, Level - 1)
    end.
 %% Count the {K, V} pairs held beneath Level levels of markers
 size(SkipList, Level) ->
    CountFun =
        case Level of
            1 ->
                fun(SL) -> length(SL) end;
            _ ->
                fun(SL) -> size(SL, Level - 1) end
        end,
    lists:sum(lists:map(fun({_Mark, SL}) -> CountFun(SL) end, SkipList)).
 %%%============================================================================
 %%% Internal Functions
 %%%============================================================================
 %% The hash divisor which makes a key a marker at level N - Width to the
 %% power N, as each level groups Width entries of the level beneath it.
 %% (Multiplies by the base width at each step; squaring the running total
 %% only gives the right answer up to level 2.)
 width(1, Width) ->
    Width;
 width(N, Width) when is_integer(N), N > 1 ->
    Width * width(N - 1, Width).
 %% Return the first {Marker, SubList} entry whose marker is >= Key, or false
 %% if there is no such entry.  Stops at the first match rather than folding
 %% over the remainder of the list.
 find_mark(_Key, []) ->
    false;
 find_mark(Key, [{Marker, SL}|_Rest]) when Marker >= Key ->
    {Marker, SL};
 find_mark(Key, [{_Marker, _SL}|Rest]) ->
    find_mark(Key, Rest).
 %% Return the sublist held under the first marker which is >= Key, or null
 %% if there is no such marker.  Stops at the first match rather than folding
 %% over the remainder of the list.
 get_sublist(_Key, []) ->
    null;
 get_sublist(Key, [{SkipKey, SL}|_Rest]) when SkipKey >= Key ->
    SL;
 get_sublist(Key, [{_SkipKey, _SL}|Rest]) ->
    get_sublist(Key, Rest).
 %%%============================================================================
 %%% Test
 %%%============================================================================
 -ifdef(TEST).
 %% Generate Count random {Key, Value} pairs for testing, with sequence
 %% numbers counting up from Seqn.  Keys are {o, Bucket, Key, null} tuples.
 generate_randomkeys(Seqn, Count, BucketRangeLow, BucketRangeHigh) ->
    generate_randomkeys(Seqn, Count, [], BucketRangeLow, BucketRangeHigh).
 %% Accumulating form.  The bucket number is BucketLow plus a random offset
 %% of up to BRange (no offset when BRange is 0); the key number is a random
 %% integer up to 1000.  Both are left-padded with zeros to four characters.
 generate_randomkeys(_Seqn, 0, Acc, _BucketLow, _BucketHigh) ->
    Acc;
 generate_randomkeys(Seqn, Count, Acc, BucketLow, BRange) ->
    ZeroPad = fun(I) -> string:right(integer_to_list(I), 4, $0) end,
    BucketOffset =
        if
            BRange =:= 0 ->
                0;
            true ->
                random:uniform(BRange)
        end,
    Bucket = "Bucket" ++ ZeroPad(BucketLow + BucketOffset),
    Key = "Key" ++ ZeroPad(random:uniform(1000)),
    KV = {{o, Bucket, Key, null}, {Seqn, {active, infinity}, null}},
    generate_randomkeys(Seqn + 1, Count - 1, [KV|Acc], BucketLow, BRange).
 skiplist_small_test() ->
    % Check nothing bad happens with very small lists
    lists:foreach(fun dotest_skiplist_small/1, lists:seq(1, 32)).
 %% Build a skiplist of N random keys both dynamically (one enter at a time)
 %% and statically (from_list), and check that every key can be found in both
 dotest_skiplist_small(N) ->
    KL = generate_randomkeys(1, N, 1, 2),
    DynamicSL = lists:foldl(fun({K, V}, SL) -> enter(K, V, SL) end,
                            empty(),
                            KL),
    StaticSL = from_list(lists:reverse(KL)),
    CheckFun =
        fun(SkipList) ->
            lists:foreach(fun({K, V}) ->
                                ?assertMatch({value, V}, lookup(K, SkipList))
                                end,
                            lists:ukeysort(1, lists:reverse(KL)))
        end,
    CheckFun(DynamicSL),
    CheckFun(StaticSL).
 %% Run the full skiplist test with bloom protection enabled
 skiplist_withbloom_test() ->
    Banner = "~n~nBloom protected skiplist test:~n~n",
    io:format(user, Banner, []),
    skiplist_tester(true).
 %% Run the full skiplist test without bloom protection
 skiplist_nobloom_test() ->
    Banner = "~n~nBloom free skiplist test:~n~n",
    io:format(user, Banner, []),
    skiplist_tester(false).
 %% Build a skiplist of 4000 random keys by each of the available routes
 %% (from an ordered set, from an unsorted list, from a sorted list and by
 %% dynamic entry), reporting the time taken for each, then run the timing
 %% tests against both the generated and the dynamically loaded skiplist.
 %% SkipList is bound by the first build and then matched against the result
 %% of each later static build, so the test also asserts that all of the
 %% static routes produce an identical structure.
 skiplist_tester(Bloom) ->
    N = 4000,
    KL = generate_randomkeys(1, N, 1, N div 5),
    OS = ets:new(test, [ordered_set, private]),
    ets:insert(OS, KL),
    SWaETS = os:timestamp(),
    SkipList = from_orderedset(OS, Bloom),
    io:format(user, "Generating skip list with ~w keys in ~w microseconds " ++
                        "from ordered set~n",
                [N, timer:now_diff(os:timestamp(), SWaETS)]),
    SWaGSL = os:timestamp(),
    SkipList = from_list(lists:reverse(KL), Bloom),
    io:format(user, "Generating skip list with ~w keys in ~w microseconds~n" ++
                        "Top level key count of ~w~n",
                [N,
                    timer:now_diff(os:timestamp(), SWaGSL),
                    length(element(2, SkipList))]),
    io:format(user, "Second tier key counts of ~w~n",
                [lists:map(fun({_L, SL}) -> length(SL) end,
                    element(2, SkipList))]),
    KLSorted = lists:ukeysort(1, lists:reverse(KL)),
    SWaGSL2 = os:timestamp(),
    SkipList = from_sortedlist(KLSorted, Bloom),
    io:format(user, "Generating skip list with ~w sorted keys in ~w " ++
                        "microseconds~n",
                [N, timer:now_diff(os:timestamp(), SWaGSL2)]),
    SWaDSL = os:timestamp(),
    % Dynamic load - the keys are entered one at a time into an empty skiplist
    SkipList1 = 
        lists:foldl(fun({K, V}, SL) ->
                            enter(K, V, SL)
                            end,
                        empty(Bloom),
                        KL),
    io:format(user, "Dynamic load of skiplist with ~w keys took ~w " ++
                        "microseconds~n" ++
                        "Top level key count of ~w~n",
                [N,
                    timer:now_diff(os:timestamp(), SWaDSL),
                    length(element(2, SkipList1))]),
       io:format(user, "Second tier key counts of ~w~n",
                [lists:map(fun({_L, SL}) -> length(SL) end,
                    element(2, SkipList1))]),
    io:format(user, "~nRunning timing tests for generated skiplist:~n", []),
    skiplist_timingtest(KLSorted, SkipList, N, Bloom),
    io:format(user, "~nRunning timing tests for dynamic skiplist:~n", []),
    skiplist_timingtest(KLSorted, SkipList1, N, Bloom).
 %% Time (and assert the results of) lookups, range queries and flattening
 %% against SkipList.  KL is the key-sorted, de-duplicated list of the
 %% {K, V} pairs expected to be in the skiplist, N the number of keys which
 %% were generated, and Bloom indicates if the skiplist is bloom-protected.
 skiplist_timingtest(KL, SkipList, N, Bloom) ->
    io:format(user, "Timing tests on skiplist of size ~w~n",
                [leveled_skiplist:size(SkipList)]),
    % Samples of keys known to be present, taken from across the key range
    CheckList1 = lists:sublist(KL, N div 4, 200),
    CheckList2 = lists:sublist(KL, N div 3, 200),
    CheckList3 = lists:sublist(KL, N div 2, 200),
    CheckList4 = lists:sublist(KL, N - 1000, 200),
    CheckList5 = lists:sublist(KL, N - 500, 200),
    CheckList6 = lists:sublist(KL, 1, 10),
    CheckList7 = lists:nthtail(N - 200, KL),
    CheckList8 = lists:sublist(KL, N div 2, 1),
    CheckAll = CheckList1 ++ CheckList2 ++ CheckList3 ++
                    CheckList4 ++ CheckList5 ++ CheckList6 ++ CheckList7,
    SWb = os:timestamp(),
    lists:foreach(fun({K, V}) ->
                        ?assertMatch({value, V}, lookup(K, SkipList))
                        end,
                    CheckAll),
    % NOTE(review): the key count in this message is fixed text, it is not
    % derived from length(CheckAll)
    io:format(user, "Finding 1020 keys took ~w microseconds~n",
                [timer:now_diff(os:timestamp(), SWb)]),
    % Query the range between the first and last key of a check list - and
    % optionally assert that the range holds as many results as the check list
    % has unique entries
    RangeFun =
        fun(SkipListToQuery, CheckListForQ, Assert) ->
            KR =
                to_range(SkipListToQuery,
                            element(1, lists:nth(1, CheckListForQ)),
                            element(1, lists:last(CheckListForQ))),
            case Assert of
                true ->
                    CompareL = length(lists:usort(CheckListForQ)),
                    ?assertMatch(CompareL, length(KR));
                false ->
                    KR
            end
            end,
    SWc = os:timestamp(),
    RangeFun(SkipList, CheckList1, true),
    RangeFun(SkipList, CheckList2, true),
    RangeFun(SkipList, CheckList3, true),
    RangeFun(SkipList, CheckList4, true),
    RangeFun(SkipList, CheckList5, true),
    RangeFun(SkipList, CheckList6, true),
    RangeFun(SkipList, CheckList7, true),
    RangeFun(SkipList, CheckList8, true),
    % Ranges using bucket numbers above, and then below, those used when
    % generating the skiplist should return nothing
    KL_OOR1 = generate_randomkeys(1, 4, N div 5 + 1, N div 5 + 10),
    KR9 = RangeFun(SkipList, KL_OOR1, false),
    ?assertMatch([], KR9),
    KL_OOR2 = generate_randomkeys(1, 4, 0, 0),
    KR10 = RangeFun(SkipList, KL_OOR2, false),
    ?assertMatch([], KR10),
    io:format(user, "Finding 10 ranges took ~w microseconds~n",
                [timer:now_diff(os:timestamp(), SWc)]),
    AltKL1 = generate_randomkeys(1, 2000, 1, 200),
    SWd0 = os:timestamp(),
    lists:foreach(fun({K, _V}) ->
                        lookup(K, SkipList)
                        end,
                    AltKL1),
    io:format(user, "Getting 2000 mainly missing keys took ~w microseconds~n",
                [timer:now_diff(os:timestamp(), SWd0)]),
    % Time the two hash functions alone, for comparison with the lookup times
    SWd1 = os:timestamp(),
    lists:foreach(fun({K, _V}) ->
                        leveled_codec:magic_hash(K)
                        end,
                    AltKL1),
    io:format(user, "Generating 2000 magic hashes took ~w microseconds~n",
                [timer:now_diff(os:timestamp(), SWd1)]),
    SWd2 = os:timestamp(),
    lists:foreach(fun({K, _V}) ->
                        erlang:phash2(K)
                        end,
                    AltKL1),
    io:format(user, "Generating 2000 not so magic hashes took ~w microseconds~n",
                [timer:now_diff(os:timestamp(), SWd2)]),
    AltKL2 = generate_randomkeys(1, 1000, N div 5 + 1, N div 5 + 300),
    SWe = os:timestamp(),
    lists:foreach(fun({K, _V}) ->
                        none = lookup(K, SkipList)
                        end,
                    AltKL2),
    io:format(user, "Getting 1000 missing keys above range took ~w " ++
                        "microseconds~n",
                [timer:now_diff(os:timestamp(), SWe)]),
    AltKL3 = generate_randomkeys(1, 1000, 0, 0),
    SWf = os:timestamp(),
    lists:foreach(fun({K, _V}) ->
                        none = lookup(K, SkipList)
                        end,
                    AltKL3),
    io:format(user, "Getting 1000 missing keys below range took ~w " ++
                        "microseconds~n",
                [timer:now_diff(os:timestamp(), SWf)]),
    SWg = os:timestamp(),
    FlatList = to_list(SkipList),
    io:format(user, "Flattening skiplist took ~w microseconds~n",
                [timer:now_diff(os:timestamp(), SWg)]),
    % Flattening must give back exactly the sorted list of expected pairs
    ?assertMatch(KL, FlatList),
    case Bloom of
        true ->
            % Random integers are used as both key and hash for lookup/3 - the
            % results are not asserted, only the time taken is reported
            HashList = lists:map(fun(_X) ->
                                        random:uniform(4294967295) end,
                                    lists:seq(1, 2000)),
            SWh = os:timestamp(),
            lists:foreach(fun(X) ->
                                lookup(X, X, SkipList) end,
                            HashList),
            io:format(user,
                        "Getting 2000 missing keys when hash was known " ++
                            "took ~w microseconds~n",
                        [timer:now_diff(os:timestamp(), SWh)]);
        false ->
            ok
    end.
 %% A deterministic test {Key, Value} pair for the integer X - the key number
 %% is right-aligned (space-padded) to six characters
 define_kv(X) ->
    PaddedX = string:right(integer_to_list(X), 6),
    Key = {o, "Bucket", "Key" ++ PaddedX, null},
    Value = {X, {active, infinity}, null},
    {Key, Value}.
 %% Test a skiplist holding 4096 keys: every key must be found, and each of
 %% the first 25 consecutive 32-key ranges must be returned exactly
 skiplist_roundsize_test() ->
    KVL = lists:map(fun define_kv/1, lists:seq(1, 4096)),
    SkipList = from_list(KVL),
    LookupCheck =
        fun({K, V}) ->
            ?assertMatch({value, V}, lookup(K, SkipList))
        end,
    lists:foreach(LookupCheck, KVL),
    RangeCheck =
        fun(X) ->
            FirstIdx = X * 32 + 1,
            {KS, _VS} = define_kv(FirstIdx),
            {KE, _VE} = define_kv((X + 1) * 32),
            R = to_range(SkipList, KS, KE),
            L = lists:sublist(KVL, FirstIdx, 32),
            ?assertMatch(L, R)
        end,
    lists:foreach(RangeCheck, lists:seq(0, 24)).
 %% Keys added with enter_nolookup/3 to a bloom-protected skiplist must not be
 %% found by lookup/2, but must all be present when the list is flattened
 skiplist_nolookup_test() ->
    N = 4000,
    KL = generate_randomkeys(1, N, 1, N div 5),
    EnterFun = fun({K, V}, Acc) -> enter_nolookup(K, V, Acc) end,
    SkipList = lists:foldl(EnterFun, empty(true), KL),
    KLSorted = lists:ukeysort(1, lists:reverse(KL)),
    NotFoundCheck =
        fun({K, _V}) ->
            ?assertMatch(none, lookup(K, SkipList))
        end,
    lists:foreach(NotFoundCheck, KL),
    ?assertMatch(KLSorted, to_list(SkipList)).
 %% Single-key ranges at the edges of a skiplist: the last key of lists of
 %% 128, 127 and 129 keys, and the first key of the 129 key list
 skiplist_range_test() ->
    N = 150,
    KL = generate_randomkeys(1, N, 1, N div 5),
    SortedKL = lists:ukeysort(1, KL),
    CheckLastFun =
        fun(Len) ->
            KVs = lists:sublist(SortedKL, Len),
            SkipList = from_list(KVs),
            {LastK, LastV} = lists:last(KVs),
            Range = to_range(SkipList, LastK, LastK),
            ?assertMatch([{LastK, LastV}], Range),
            {KVs, SkipList}
        end,
    CheckLastFun(128),
    CheckLastFun(127),
    {KLSL3, SkipList3} = CheckLastFun(129),
    {FirstK4, V4} = lists:nth(1, KLSL3),
    R4 = to_range(SkipList3, FirstK4, FirstK4),
    ?assertMatch([{FirstK4, V4}], R4).
 %% An empty skiplist has a size of zero, with or without bloom protection
 empty_skiplist_size_test() ->
    lists:foreach(fun(BloomProtect) ->
                        ?assertMatch(0,
                                        leveled_skiplist:size(
                                            empty(BloomProtect)))
                        end,
                    [false, true]).
 -endif.
--- a/src/leveled_tinybloom.erl
+++ b/src/leveled_tinybloom.erl
@ -1,159 +0,0 @@
 %% -------- TINY BLOOM ---------
 %%
 %% For sheltering relatively expensive lookups with a probabilistic check
 %%
 %% Uses multiple 512 byte blooms.  Can sensibly hold up to 1000 keys per array.
 %% Even at 1000 keys should still offer only a 20% false positive rate
 %%
 %% Restricted to no more than 256 arrays - so can't handle more than 250K keys
 %% in total
 %%
 %% Implemented this way to make it easy to control false positive (just by
 %% setting the width).  Also only requires binary manipulations of a single
 %% hash
 -module(leveled_tinybloom).
 -include("include/leveled.hrl").
 -export([
        enter/2,
        check/2,
        empty/1
        ]).      
 -include_lib("eunit/include/eunit.hrl").
 %%%============================================================================
 %%% Bloom API
 %%%============================================================================
 %% Create a bloom of Width (no more than 256) arrays - a dict mapping each
 %% slot number from 0 to Width - 1 to an all-zero array of 4096 bits
 empty(Width) when Width =< 256 ->
    dict:from_list([{Slot, <<0:4096>>} || Slot <- lists:seq(0, Width - 1)]).
 %% Add a key (or a pre-computed {hash, Hash}) to the bloom.  The hash picks
 %% the slot (modulo the number of arrays in the bloom) and the two bit
 %% positions to be set within that slot's array.  A hash of no_lookup leaves
 %% the bloom unchanged.
 enter({hash, no_lookup}, Bloom) ->
    Bloom;
 enter({hash, Hash}, Bloom) ->
    {SlotSeed, BitA, BitB} = split_hash(Hash),
    Slot = SlotSeed rem dict:size(Bloom),
    SetBitFun = fun(Bit, Arr) -> add_to_array(Bit, Arr, 4096) end,
    UpdArray =
        lists:foldl(SetBitFun,
                    dict:fetch(Slot, Bloom),
                    lists:usort([BitA, BitB])),
    dict:store(Slot, <<UpdArray/binary>>, Bloom);
 enter(Key, Bloom) ->
    enter({hash, leveled_codec:magic_hash(Key)}, Bloom).
 %% Check a key (or a pre-computed {hash, Hash}) against the bloom.  Returns
 %% true only if both of the bit positions for the hash are set in the array
 %% for the hash's slot; the second bit is only examined if the first is set.
 check({hash, Hash}, Bloom) ->
    {SlotSeed, BitA, BitB} = split_hash(Hash),
    Slot = SlotSeed rem dict:size(Bloom),
    BitArray = dict:fetch(Slot, Bloom),
    IsSetFun = fun(Bit) -> getbit(Bit, BitArray, 4096) =:= <<1:1>> end,
    IsSetFun(BitA) andalso IsSetFun(BitB);
 check(Key, Bloom) ->
    check({hash, leveled_codec:magic_hash(Key)}, Bloom).
 %%%============================================================================
 %%% Internal Functions
 %%%============================================================================
 %% Split a hash into {Slot, Bit1, Bit2}: the low 8 bits select the slot, and
 %% the next two groups of 12 bits give the bit positions within the array.
 %% Both bit positions are masked to 12 bits so that they are always valid
 %% positions in a 4096 bit array, even if the hash is wider than 32 bits.
 split_hash(Hash) ->
    Slot = Hash band 255,
    Bit1 = (Hash bsr 8) band 4095,
    Bit2 = (Hash bsr 20) band 4095,
    {Slot, Bit1, Bit2}.
 %% Return BitArray (ArrayLength bits long) with the bit at position Bit set
 %% to 1.  Positions count from 0 at the start of the array.
 add_to_array(Bit, BitArray, ArrayLength) ->
    TrailingLen = ArrayLength - Bit - 1,
    <<Prefix:Bit/bitstring, _:1, Suffix:TrailingLen/bitstring>> = BitArray,
    <<Prefix/bitstring, 1:1, Suffix/bitstring>>.
 %% Return the bit at position Bit of BitArray (ArrayLength bits long) as a
 %% one-bit bitstring.  Positions count from 0 at the start of the array.
 getbit(Bit, BitArray, ArrayLength) ->
    TrailingLen = ArrayLength - Bit - 1,
    <<_:Bit/bitstring, Flag:1/bitstring, _:TrailingLen/bitstring>> = BitArray,
    Flag.
 %%%============================================================================
 %%% Test
 %%%============================================================================
 -ifdef(TEST).
 %% Add 4000 random keys to a bloom of 6 arrays, then check that every key
 %% added is found and that fewer than a quarter of 4000 keys which were not
 %% added are reported as present.  Timings are output for the hash
 %% functions and for the bloom operations.
 %% Random key suffixes use crypto:strong_rand_bytes/1, as
 %% crypto:rand_bytes/1 is no longer available in current OTP releases.
 simple_test() ->
    N = 4000,
    W = 6,
    MakeKeysFun =
        fun(Prefix) ->
            lists:map(fun(X) ->
                            Prefix ++
                            integer_to_list(X) ++
                            integer_to_list(random:uniform(100)) ++
                            binary_to_list(crypto:strong_rand_bytes(2))
                            end,
                        lists:seq(1, N))
        end,
    KLin = MakeKeysFun("Key_"),
    KLout = MakeKeysFun("NotKey_"),
    SW0_PH = os:timestamp(),
    lists:foreach(fun(X) -> erlang:phash2(X) end, KLin),
    io:format(user,
                "~nNative hash function hashes ~w keys in ~w microseconds~n",
                [N, timer:now_diff(os:timestamp(), SW0_PH)]),
    SW0_MH = os:timestamp(),
    lists:foreach(fun(X) -> leveled_codec:magic_hash(X) end, KLin),
    io:format(user,
                "~nMagic hash function hashes ~w keys in ~w microseconds~n",
                [N, timer:now_diff(os:timestamp(), SW0_MH)]),
    SW1 = os:timestamp(),
    Bloom = lists:foldr(fun enter/2, empty(W), KLin),
    io:format(user,
                "~nAdding ~w keys to bloom took ~w microseconds~n",
                [N, timer:now_diff(os:timestamp(), SW1)]),
    SW2 = os:timestamp(),
    lists:foreach(fun(X) -> ?assertMatch(true, check(X, Bloom)) end, KLin),
    io:format(user,
                "~nChecking ~w keys in bloom took ~w microseconds~n",
                [N, timer:now_diff(os:timestamp(), SW2)]),
    SW3 = os:timestamp(),
    % Count the false positives amongst the keys which were never added
    FP = lists:foldr(fun(X, Acc) -> case check(X, Bloom) of
                                        true -> Acc + 1;
                                        false -> Acc
                                    end end,
                        0,
                        KLout),
    io:format(user,
                "~nChecking ~w keys out of bloom took ~w microseconds " ++
                    "with ~w false positive rate~n",
                [N, timer:now_diff(os:timestamp(), SW3), FP / N]),
    ?assertMatch(true, FP < (N div 4)).
 -endif.
--- a/src/leveled_tree.erl
+++ b/src/leveled_tree.erl
@ -162,7 +162,17 @@ to_list({tree, _L, Tree}) ->
        end,
    lists:foldl(FoldFun, [], tree_to_list(Tree));
 to_list({idxt, _L, {TLI, _IDX}}) ->
-    lists:append(tuple_to_list(TLI)).
+    lists:append(tuple_to_list(TLI));
 to_list({skpl, _L, SkipList}) ->
    FoldFun = 
        fun({_M, SL}, Acc) ->
            [SL|Acc]
        end,
    Lv1List = lists:reverse(lists:foldl(FoldFun, [], SkipList)),
    Lv0List = lists:reverse(lists:foldl(FoldFun, [], lists:append(Lv1List))),
    lists:append(Lv0List).
 tsize({_Type, L, _Tree}) ->
@ -171,7 +181,9 @@ tsize({_Type, L, _Tree}) ->
 empty(tree) ->
    {tree, 0, empty_tree()};
 empty(idxt) ->
-    {idxt, 0, {{}, empty_tree()}}.
+    {idxt, 0, {{}, empty_tree()}};
 empty(skpl) ->
    {skpl, 0, []}.
 %%%============================================================================
 %%% Internal Functions
@ -216,14 +228,22 @@ roll_list(KVList, L, SkipList, SkipWidth) ->
-lookup_match(_Key, []) ->
+% lookup_match(_Key, []) ->
-    none;
+%     none;
-lookup_match(Key, [{EK, _EV}|_Tail]) when EK > Key ->
+% lookup_match(Key, [{EK, _EV}|_Tail]) when EK > Key ->
-    none;
+%     none;
-lookup_match(Key, [{Key, EV}|_Tail]) ->
+% lookup_match(Key, [{Key, EV}|_Tail]) ->
-    {value, EV};
+%     {value, EV};
-lookup_match(Key, [_Top|Tail]) ->
+% lookup_match(Key, [_Top|Tail]) ->
-    lookup_match(Key, Tail).
+%     lookup_match(Key, Tail).
 lookup_match(Key, KVList) ->
    case lists:keyfind(Key, 1, KVList) of 
        false ->
            none;
        {Key, Value} ->
            {value, Value}
    end.
 lookup_best(Key, [{EK, EV}|_Tail]) when EK >= Key ->
    {EK, EV};
@ -396,15 +416,14 @@ skpl_getsublist(Key, SkipList) ->
    FoldFun =
        fun({Mark, SL}, Acc) ->
            case {Acc, Mark} of
-                {none, Mark} when Mark >= Key ->
+                {[], Mark} when Mark >= Key ->
                    SL;
                _ ->
                    Acc
            end
        end,
-    SL1 = lists:foldl(FoldFun, none, SkipList),
+    SL1 = lists:foldl(FoldFun, [], SkipList),
-    lists:foldl(FoldFun, none, SL1).
+    lists:foldl(FoldFun, [], SL1).
 %%%============================================================================
 %%% Balance tree implementation