From c65dfa31d8b1833422fc8700f68c0961d34cd964 Mon Sep 17 00:00:00 2001
From: Martin Sumner
Date: Fri, 24 Nov 2017 11:38:58 +0000
Subject: [PATCH] Add alternative bloom

Bloom filter that can take larger keys but is still efficient to build.
Allows bloom filter to be checked without first determining the slot.
Also, as it represents the whole SST - it could be sent to the penciller
to remove the need for a message pass.

The bloom is smaller and has a worse fpr than leveled_tinybloom. Failing
the bloom check isn't so bad - due to the slot index check being
relatively fast and having a very low fpr.
---
 src/leveled_ebloom.erl    | 294 ++++++++++++++++++++++++++++++++++++++
 src/leveled_tinybloom.erl |   2 +-
 2 files changed, 295 insertions(+), 1 deletion(-)
 create mode 100644 src/leveled_ebloom.erl

diff --git a/src/leveled_ebloom.erl b/src/leveled_ebloom.erl
new file mode 100644
index 0000000..d2edc4c
--- /dev/null
+++ b/src/leveled_ebloom.erl
@@ -0,0 +1,294 @@
+%% -------- EBloom ---------
+%%
+%% A bloom filter built from 2, 4 or 8 slots of 16384 bits each (depending on
+%% the number of hashes), made to try and minimise the cost of producing it
+%%
+
+
+-module(leveled_ebloom).
+
+-include("include/leveled.hrl").
+
+-include_lib("eunit/include/eunit.hrl").
+
+-export([
+            create_bloom/1,
+            check_hash/2
+        ]).
+
+-define(BLOOM_SIZE_BYTES, 2048). % bytes per slot (?INTEGER_SIZE div 8)
+-define(INTEGER_SIZE, 16384). % bits per slot
+-define(BAND_MASK, (?INTEGER_SIZE - 1)). % parenthesised: safe in any expression
+
+
+%%%============================================================================
+%%% API
+%%%============================================================================
+
+-spec create_bloom(list({any(), integer()})) -> binary().
+%% @doc
+%% Create a binary bloom filter from a list of {SegHash, Hash} tuples
+create_bloom(HashList) ->
+    case length(HashList) of
+        0 ->
+            <<>>;
+        L when L > 16384 -> % 8 slots, slot mask 7
+            add_hashlist(HashList,
+                            7,
+                            0, 0, 0, 0, 0, 0, 0, 0);
+        L when L > 8192 -> % 4 slots, slot mask 3
+            add_hashlist(HashList, 3, 0, 0, 0, 0);
+        _ -> % 2 slots, slot mask 1
+            add_hashlist(HashList, 1, 0, 0)
+    end.
+
+
+-spec check_hash({any(), integer()}, binary()) -> boolean().
+%% @doc
+%% Check if a {SegHash, Hash} may be present in a bloom (false = never added)
+check_hash(_Hash, <<>>) ->
+    false; % an empty bloom holds no hashes
+check_hash({_SegHash, Hash}, BloomBin) ->
+    SlotSplit = (byte_size(BloomBin) div ?BLOOM_SIZE_BYTES) - 1,
+    {Slot, Hashes} = split_hash(Hash, SlotSplit),
+    Mask = get_mask(Hashes),
+    Pos = Slot * ?BLOOM_SIZE_BYTES, % byte offset of the slot's integer
+    IntSize = ?INTEGER_SIZE,
+    <<_H:Pos/binary, CheckInt:IntSize/integer, _T/binary>> = BloomBin,
+    case CheckInt band Mask of
+        Mask -> % every bit of the mask is set in the slot
+            true;
+        _ ->
+            false
+    end.
+
+%%%============================================================================
+%%% Internal Functions
+%%%============================================================================
+
+split_hash(Hash, SlotSplit) ->
+    Slot = Hash band SlotSplit, % low bits of the hash select the slot
+    H0 = (Hash bsr 4) band (?BAND_MASK),
+    H1 = (Hash bsr 18) band (?BAND_MASK),
+    % H2 = (Hash bsr 34) band (?BAND_MASK),
+    % H3 = (Hash bsr 49) band (?BAND_MASK),
+    {Slot, [H0, H1
+            %, H2, H3
+            ]}.
+
+get_mask([H0, H1
+            %, H2, H3
+            ]) ->
+    (1 bsl H0) bor (1 bsl H1)
+    % bor (1 bsl H2) bor (1 bsl H3)
+    .
+
+
+%% This looks ugly and clunky, but in tests it was quicker than modifying an
+%% Erlang term like an array as it is passed around the loop
+
+add_hashlist([], _S, S0, S1) ->
+    IntSize = ?INTEGER_SIZE,
+    <<S0:IntSize/integer, S1:IntSize/integer>>; % slot order read by check_hash
+add_hashlist([{_SegHash, TopHash}|T], SlotSplit, S0, S1) ->
+    {Slot, Hashes} = split_hash(TopHash, SlotSplit),
+    Mask = get_mask(Hashes),
+    case Slot of
+        0 ->
+            add_hashlist(T, SlotSplit, S0 bor Mask, S1);
+        1 ->
+            add_hashlist(T, SlotSplit, S0, S1 bor Mask)
+    end.
+
+add_hashlist([], _S, S0, S1, S2, S3) ->
+    IntSize = ?INTEGER_SIZE,
+    <<S0:IntSize/integer, S1:IntSize/integer,
+        S2:IntSize/integer, S3:IntSize/integer>>;
+add_hashlist([{_SegHash, TopHash}|T], SlotSplit, S0, S1, S2, S3) ->
+    {Slot, Hashes} = split_hash(TopHash, SlotSplit),
+    Mask = get_mask(Hashes),
+    case Slot of
+        0 ->
+            add_hashlist(T, SlotSplit, S0 bor Mask, S1, S2, S3);
+        1 ->
+            add_hashlist(T, SlotSplit, S0, S1 bor Mask, S2, S3);
+        2 ->
+            add_hashlist(T, SlotSplit, S0, S1, S2 bor Mask, S3);
+        3 ->
+            add_hashlist(T, SlotSplit, S0, S1, S2, S3 bor Mask)
+    end.
+
+add_hashlist([], _S, S0, S1, S2, S3, S4, S5, S6, S7) ->
+    IntSize = ?INTEGER_SIZE,
+    <<S0:IntSize/integer, S1:IntSize/integer,
+        S2:IntSize/integer, S3:IntSize/integer,
+        S4:IntSize/integer, S5:IntSize/integer,
+        S6:IntSize/integer, S7:IntSize/integer>>;
+add_hashlist([{_SegHash, TopHash}|T],
+                SlotSplit,
+                S0, S1, S2, S3, S4, S5, S6, S7) ->
+    {Slot, Hashes} = split_hash(TopHash, SlotSplit),
+    Mask = get_mask(Hashes),
+    case Slot of
+        0 ->
+            add_hashlist(T,
+                            SlotSplit,
+                            S0 bor Mask, S1, S2, S3, S4, S5, S6, S7);
+        1 ->
+            add_hashlist(T,
+                            SlotSplit,
+                            S0, S1 bor Mask, S2, S3, S4, S5, S6, S7);
+        2 ->
+            add_hashlist(T,
+                            SlotSplit,
+                            S0, S1, S2 bor Mask, S3, S4, S5, S6, S7);
+        3 ->
+            add_hashlist(T,
+                            SlotSplit,
+                            S0, S1, S2, S3 bor Mask, S4, S5, S6, S7);
+        4 ->
+            add_hashlist(T,
+                            SlotSplit,
+                            S0, S1, S2, S3, S4 bor Mask, S5, S6, S7);
+        5 ->
+            add_hashlist(T,
+                            SlotSplit,
+                            S0, S1, S2, S3, S4, S5 bor Mask, S6, S7);
+        6 ->
+            add_hashlist(T,
+                            SlotSplit,
+                            S0, S1, S2, S3, S4, S5, S6 bor Mask, S7);
+        7 ->
+            add_hashlist(T,
+                            SlotSplit,
+                            S0, S1, S2, S3, S4, S5, S6, S7 bor Mask)
+    end.
+
+
+%%%============================================================================
+%%% Test
+%%%============================================================================
+
+-ifdef(TEST).
+
+generate_orderedkeys(Seqn, Count, BucketRangeLow, BucketRangeHigh) ->
+    generate_orderedkeys(Seqn, % entry point: starts with an empty accumulator
+                            Count,
+                            [],
+                            BucketRangeLow,
+                            BucketRangeHigh).
+
+generate_orderedkeys(_Seqn, 0, Acc, _BucketLow, _BucketHigh) ->
+    Acc; % keys were prepended, so the last generated key comes first
+generate_orderedkeys(Seqn, Count, Acc, BucketLow, BucketHigh) ->
+    BNumber = Seqn div (BucketHigh - BucketLow),
+    BucketExt = string:right(integer_to_list(BucketLow + BNumber), 4, $0),
+    KNumber = Seqn * 100 + leveled_rand:uniform(100),
+    KeyExt =
+        string:right(integer_to_list(KNumber), 8, $0),
+    LK = leveled_codec:to_ledgerkey("Bucket" ++ BucketExt, "Key" ++ KeyExt, o),
+    Chunk = leveled_rand:rand_bytes(16),
+    {_B, _K, MV, _H, _LMs} =
+        leveled_codec:generate_ledgerkv(LK, Seqn, Chunk, 64, infinity),
+    generate_orderedkeys(Seqn + 1,
+                            Count - 1,
+                            [{LK, MV}|Acc],
+                            BucketLow,
+                            BucketHigh).
+
+%% Hash N generated ledger keys with leveled_codec:segment_hash/1
+get_hashlist(N) ->
+    KVL = generate_orderedkeys(1, N, 1, 20),
+    HashFun =
+        fun({K, _V}) ->
+            leveled_codec:segment_hash(K)
+        end,
+    lists:map(HashFun, KVL).
+
+check_all_hashes(BloomBin, HashList) -> % every hash added must be found
+    CheckFun =
+        fun(Hash) ->
+            ?assertMatch(true, check_hash(Hash, BloomBin))
+        end,
+    lists:foreach(CheckFun, HashList).
+
+check_neg_hashes(BloomBin, HashList, Counters) -> % Counters = {Hits, Misses}
+    CheckFun =
+        fun(Hash, {AccT, AccF}) ->
+            case check_hash(Hash, BloomBin) of
+                true ->
+                    {AccT + 1, AccF};
+                false ->
+                    {AccT, AccF + 1}
+            end
+        end,
+    lists:foldl(CheckFun, Counters, HashList).
+
+%% An empty bloom must reject every hash; hashes are {SegHash, Hash} tuples
+empty_bloom_test() ->
+    BloomBin0 = create_bloom([]),
+    HashList = [{0, 0}, {0, 10}, {0, 100}, {0, 100000}],
+    ?assertMatch({0, 4}, check_neg_hashes(BloomBin0, HashList, {0, 0})).
+
+bloom_test_() ->
+    {timeout, 60, fun bloom_test_ranges/0}.
+
+bloom_test_ranges() ->
+    test_bloom(40000, 2),
+    test_bloom(?INTEGER_SIZE, 10),
+    test_bloom(20000, 2),
+    test_bloom(10000, 2),
+    test_bloom(5000, 2).
+
+test_bloom(N, Runs) ->
+    ListOfHashLists =
+        lists:map(fun(_X) -> get_hashlist(N * 2) end, lists:seq(1, Runs)),
+    SplitListFun = % splits each list ~50/50 into hashes to add and to keep out
+        fun(HashList) ->
+            HitOrMissFun =
+                fun (Entry, {HitL, MissL}) ->
+                    case leveled_rand:uniform(2) =:= 1 of
+                        true ->
+                            {[Entry|HitL], MissL};
+                        false ->
+                            {HitL, [Entry|MissL]}
+                    end
+                end,
+            lists:foldl(HitOrMissFun, {[], []}, HashList)
+        end,
+    SplitListOfHashLists = lists:map(SplitListFun, ListOfHashLists),
+
+    SWa = os:timestamp(), % phase a: time building one bloom per run
+    ListOfBlooms =
+        lists:map(fun({HL, _ML}) -> create_bloom(HL) end,
+                    SplitListOfHashLists),
+    TSa = timer:now_diff(os:timestamp(), SWa),
+
+    SWb = os:timestamp(), % phase b: every added hash must be found
+    lists:foreach(fun(Nth) ->
+                        {HL, _ML} = lists:nth(Nth, SplitListOfHashLists),
+                        BB = lists:nth(Nth, ListOfBlooms),
+                        check_all_hashes(BB, HL)
+                    end,
+                    lists:seq(1, Runs)),
+    TSb = timer:now_diff(os:timestamp(), SWb),
+
+    SWc = os:timestamp(), % phase c: count hits on hashes that were not added
+    {Pos, Neg} =
+        lists:foldl(fun(Nth, Acc) ->
+                            {_HL, ML} = lists:nth(Nth, SplitListOfHashLists),
+                            BB = lists:nth(Nth, ListOfBlooms),
+                            check_neg_hashes(BB, ML, Acc)
+                        end,
+                        {0, 0},
+                        lists:seq(1, Runs)),
+    FPR = Pos / (Pos + Neg),
+    TSc = timer:now_diff(os:timestamp(), SWc),
+
+    io:format(user,
+                "Test with size ~w has microsecond timings: -"
+                    ++ " build ~w check ~w neg_check ~w and fpr ~w~n",
+                [N, TSa, TSb, TSc, FPR]).
+
+
+-endif.
diff --git a/src/leveled_tinybloom.erl b/src/leveled_tinybloom.erl
index 5513d31..88d1e12 100644
--- a/src/leveled_tinybloom.erl
+++ b/src/leveled_tinybloom.erl
@@ -226,7 +226,7 @@ bloom_test_() ->
     {timeout, 20, fun bloom_test_ranges/0}.
 
 bloom_test_ranges() ->
-    test_bloom(128, 2000),
+    test_bloom(128, 256),
     test_bloom(64, 100),
     test_bloom(32, 100),
     test_bloom(16, 100),