Alternate tinybloom

Previously the code had used a tiny bloom - but this proved to be expensive to build. Looking at the alternative of a slot-size only tiny bloom
2017-01-24 15:48:12 +00:00 · 2017-01-24 15:48:12 +00:00 · a8488663c7
commit a8488663c7
parent 266e851a96
1 changed files with 230 additions and 0 deletions
--- a/src/leveled_tinybloom.erl
+++ b/src/leveled_tinybloom.erl
@ -0,0 +1,230 @@
 %% -------- TinyBloom ---------
 %%
 %% A small bloom sized for up to 128 keys (between 2 and 16 32-bit slots,
 %% chosen from the number of hashes), made to try and minimise the cost of
 %% producing the bloom
 %%
 -module(leveled_tinybloom).
 -include("include/leveled.hrl").
 %% Tuple of the first 32 powers of two: element N holds 2^(N-1), so element 1
 %% is 1 and element 32 is 2147483648.
 %% As this is a macro, the expression is expanded (and evaluated) at every
 %% point of use.
 -define(TWO_POWER,
            list_to_tuple(
                [1 bsl TwoPowerExp || TwoPowerExp <- lists:seq(0, 31)]
                )
            ).
 -include_lib("eunit/include/eunit.hrl").
 -export([
            create_bloom/1,
            check_hash/2
            ]).
 %%%============================================================================
 %%% API
 %%%============================================================================
 %% Build a bloom binary from a list of integer hashes.
 %%
 %% The number of 32-bit slots is chosen from the length of the list: 16 slots
 %% for more than 64 hashes, 8 for more than 32, 4 for more than 16 and
 %% otherwise 2.  The resulting binary is therefore 64, 32, 16 or 8 bytes
 %% long, and check_hash/2 derives the slot count back from that size.
 -spec create_bloom(list(integer())) -> binary().
 create_bloom(HashList) ->
    SlotSplit = slot_split(length(HashList)),
    EmptySlots = array:new([{size, SlotSplit + 1}, {default, 0}]),
    add_hashlist(HashList, EmptySlots, SlotSplit).

 %% Map a hash count to the slot mask (number of slots minus one) to be used.
 slot_split(Count) when Count > 64 -> 15;
 slot_split(Count) when Count > 32 -> 7;
 slot_split(Count) when Count > 16 -> 3;
 slot_split(_Count) -> 1.
 %% Test whether Hash may have been added to the bloom BloomBin.
 %%
 %% Returns false when the hash is definitely not present, and true when every
 %% bit of the hash's mask is set in its slot (which may be a false positive).
 %% The slot count is recovered from the size of the binary (4 bytes a slot).
 -spec check_hash(integer(), binary()) -> boolean().
 check_hash(Hash, BloomBin) ->
    SlotCount = byte_size(BloomBin) div 4,
    {Slot, H0, H1} = split_hash(Hash, SlotCount - 1),
    Mask = get_mask(H0, H1),
    Skip = Slot * 4,
    %% Skip over the preceding slots and read this hash's 32-bit slot
    <<_:Skip/binary, SlotInt:32/integer, _/binary>> = BloomBin,
    (SlotInt band Mask) =:= Mask.
 %%%============================================================================
 %%% Internal Functions
 %%%============================================================================
 %% Split an integer hash into {Slot, BitA, BitB}.
 %%
 %% Slot is the hash masked with SlotSplit.  BitA and BitB are each in the
 %% range 0..31, and are produced by XOR-ing pairs of 5-bit chunks of the hash
 %% (the chunks at bit offsets 4 and 14, and those at offsets 9 and 19).
 split_hash(Hash, SlotSplit) ->
    Chunk = fun(Shift) -> (Hash bsr Shift) band 31 end,
    {Hash band SlotSplit,
        Chunk(4) bxor Chunk(14),
        Chunk(9) bxor Chunk(19)}.
 %% Return the 32-bit mask with bit H0 and bit H1 set (a single bit when the
 %% two positions are equal).  H0 and H1 are expected to be in the range 0..31.
 %%
 %% The bits are produced with a shift rather than a lookup, as this function
 %% is called once per hash on both the build and the check path, and a lookup
 %% table built by an expression would have to be rebuilt on each call.
 get_mask(H0, H1) ->
    (1 bsl H0) bor (1 bsl H1).
 %% Add every hash in the list to the slot array, then serialise the slots.
 %%
 %% Each hash selects one slot and a mask (see split_hash/2 and get_mask/2),
 %% and the mask is OR-ed into that slot.  The result is a binary of the slots
 %% 0..SlotSplit in order, each written as a 32-bit integer.
 add_hashlist(HashList, SlotArray, SlotSplit) ->
    AddHashFun =
        fun(Hash, Slots) ->
            {Slot, H0, H1} = split_hash(Hash, SlotSplit),
            Current = array:get(Slot, Slots),
            array:set(Slot, Current bor get_mask(H0, H1), Slots)
        end,
    FilledSlots = lists:foldl(AddHashFun, SlotArray, HashList),
    << <<(array:get(Idx, FilledSlots)):32/integer>>
        || Idx <- lists:seq(0, SlotSplit) >>.
 %%%============================================================================
 %%% Test
 %%%============================================================================
 -ifdef(TEST).
 %% Generate Count random key/value pairs for use in tests, with sequence
 %% numbers starting at Seqn and incrementing by one for each pair.
 %%
 %% The bucket is "Bucket" followed by a zero-padded 4 character number, which
 %% is BucketRangeLow plus a random integer in 1..BucketRangeHigh (so the last
 %% argument acts as the width of the range).  The key is "Key" followed by a
 %% zero-padded 6 character random number in 1..10000.  The list is returned
 %% with the most recently generated pair first.
 generate_randomkeys(Seqn, Count, BucketRangeLow, BucketRangeHigh) ->
    generate_randomkeys(Seqn,
                        Count,
                        [],
                        BucketRangeLow,
                        BucketRangeHigh).

 generate_randomkeys(_Seqn, 0, Acc, _BucketLow, _BucketHigh) ->
    Acc;
 generate_randomkeys(Seqn, Count, Acc, BucketLow, BRange) ->
    %% NOTE(review): the random module is deprecated in favour of rand, but
    %% switching changes how the generator is seeded - confirm the minimum
    %% supported OTP release before changing it.
    BRand = random:uniform(BRange),
    BNumber = string:right(integer_to_list(BucketLow + BRand), 4, $0),
    KNumber = string:right(integer_to_list(random:uniform(10000)), 6, $0),
    LedgerKey = leveled_codec:to_ledgerkey("Bucket" ++ BNumber,
                                            "Key" ++ KNumber,
                                            o),
    %% crypto:rand_bytes/1 no longer exists in current OTP releases, so the
    %% random 64 byte value is taken from crypto:strong_rand_bytes/1
    {_B, _K, KV, _H} = leveled_codec:generate_ledgerkv(LedgerKey,
                                                        Seqn,
                                                        crypto:strong_rand_bytes(64),
                                                        64,
                                                        infinity),
    generate_randomkeys(Seqn + 1,
                        Count - 1,
                        [KV|Acc],
                        BucketLow,
                        BRange).
 %% Produce a list of up to N hashes of random keys.
 %%
 %% Twice as many key/value pairs as required are generated, so that after
 %% sorting and removing duplicate keys there should still be N left to take.
 get_hashlist(N) ->
    Candidates = generate_randomkeys(1, N * 2, 1, 20),
    UniqueKVL = lists:ukeysort(1, Candidates),
    lists:map(fun({Key, _Value}) -> leveled_codec:magic_hash(Key) end,
                lists:sublist(UniqueKVL, N)).
 %% Assert that every hash in the list is reported as present by the bloom
 %% (a bloom must never give a false negative).  Returns ok.
 check_all_hashes(_BloomBin, []) ->
    ok;
 check_all_hashes(BloomBin, [Hash|Rest]) ->
    ?assertMatch(true, check_hash(Hash, BloomBin)),
    check_all_hashes(BloomBin, Rest).
 %% Check each hash in the list against the bloom, and add the outcomes to
 %% the running {TrueCount, FalseCount} counters, returning the new counters.
 check_neg_hashes(_BloomBin, [], Counters) ->
    Counters;
 check_neg_hashes(BloomBin, [Hash|Rest], {TrueCount, FalseCount}) ->
    Updated =
        case check_hash(Hash, BloomBin) of
            true ->
                {TrueCount + 1, FalseCount};
            false ->
                {TrueCount, FalseCount + 1}
        end,
    check_neg_hashes(BloomBin, Rest, Updated).
 %% Run the bloom test at list sizes of 128, 64 and 32 hashes, in that order.
 bloom_test() ->
    lists:foreach(fun test_bloom/1, [128, 64, 32]).
 %% Build four blooms from four independently generated lists of N hashes,
 %% then:
 %% - for N of 128, assert that each bloom is 64 bytes;
 %% - assert that every hash added to a bloom is found in that bloom;
 %% - check each bloom against up to N hashes taken from a separate pool that
 %%   were not in its own list, counting positives and negatives.
 %% The time taken in microseconds to build, check and negative-check is
 %% printed along with the observed false positive rate.
 test_bloom(N) ->
    HashLists = [get_hashlist(N) || _ <- lists:seq(1, 4)],

    BuildStart = os:timestamp(),
    BloomBins = [create_bloom(HashList) || HashList <- HashLists],
    BuildTime = timer:now_diff(os:timestamp(), BuildStart),

    case N of
        128 ->
            lists:foreach(fun(BloomBin) ->
                                ?assertMatch(64, byte_size(BloomBin))
                            end,
                            BloomBins);
        _ ->
            ok
    end,

    Pairs = lists:zip(BloomBins, HashLists),

    CheckStart = os:timestamp(),
    lists:foreach(fun({BloomBin, HashList}) ->
                        check_all_hashes(BloomBin, HashList)
                    end,
                    Pairs),
    CheckTime = timer:now_diff(os:timestamp(), CheckStart),

    %% For each bloom take hashes from the pool that were not added to it
    HashPool = get_hashlist(N * 2),
    NegPairs =
        [{BloomBin, lists:sublist(lists:subtract(HashPool, HashList), N)}
            || {BloomBin, HashList} <- Pairs],

    NegStart = os:timestamp(),
    {Pos, Neg} =
        lists:foldl(fun({BloomBin, HashListOut}, Counters) ->
                            check_neg_hashes(BloomBin, HashListOut, Counters)
                        end,
                        {0, 0},
                        NegPairs),
    FPR = Pos / (Pos + Neg),
    NegTime = timer:now_diff(os:timestamp(), NegStart),

    io:format(user,
                "Test with size ~w has microsecond timings: -"
                    ++ " build ~w check ~w neg_check ~w and fpr ~w~n",
                [N, BuildTime, CheckTime, NegTime, FPR]).
 %% Spot-check the first, eighth and last elements of the powers-of-two tuple.
 twopower_test() ->
    Powers = ?TWO_POWER,
    ?assertMatch(1, element(1, Powers)),
    ?assertMatch(128, element(8, Powers)),
    ?assertMatch(2147483648, element(32, Powers)).
 -endif.