Add alternative bloom
Bloom filter that can take larger keys but is still efficient to build. Allows the bloom filter to be checked without first determining the slot. Also, as it represents the whole SST, it could be sent to the penciller to remove the need for a message pass. The bloom is smaller and has a worse fpr than leveled_tinybloom. Failing the bloom check isn't so bad, due to the slot index check being relatively fast and having a very low fpr.
This commit is contained in:
parent
0e51627539
commit
c65dfa31d8
2 changed files with 295 additions and 1 deletions
294
src/leveled_ebloom.erl
Normal file
294
src/leveled_ebloom.erl
Normal file
|
@ -0,0 +1,294 @@
|
|||
%% -------- TinyBloom ---------
|
||||
%%
|
||||
%% A fixed size bloom that supports 32K keys only, made to try and minimise
|
||||
%% the cost of producing the bloom
|
||||
%%
|
||||
|
||||
|
||||
-module(leveled_ebloom).
|
||||
|
||||
-include("include/leveled.hrl").
|
||||
|
||||
-include_lib("eunit/include/eunit.hrl").
|
||||
|
||||
-export([
|
||||
create_bloom/1,
|
||||
check_hash/2
|
||||
]).
|
||||
|
||||
-define(BLOOM_SIZE_BYTES, 2048).
|
||||
-define(INTEGER_SIZE, 16384).
|
||||
-define(BAND_MASK, ?INTEGER_SIZE - 1).
|
||||
|
||||
|
||||
%%%============================================================================
|
||||
%%% API
|
||||
%%%============================================================================
|
||||
|
||||
-spec create_bloom(list({integer(), integer()})) -> binary().
%% @doc
%% Create a binary bloom filter from a list of hashes.
%%
%% Each element of HashList is a {SegHash, Hash} pair; only the second element
%% is used to place bits in the bloom.  The number of slots in the returned
%% binary depends on the length of the list:
%%   - more than 16384 hashes -> 8 slots
%%   - more than 8192 hashes  -> 4 slots
%%   - otherwise              -> 2 slots
%% An empty list produces an empty binary.
create_bloom([]) ->
    <<>>;
create_bloom(HashList) ->
    % The second argument passed on is the slot mask (slot count - 1); the
    % remaining arguments are the per-slot accumulators, all starting at 0.
    case length(HashList) of
        L when L > 16384 ->
            add_hashlist(HashList, 7, 0, 0, 0, 0, 0, 0, 0, 0);
        L when L > 8192 ->
            add_hashlist(HashList, 3, 0, 0, 0, 0);
        _ ->
            add_hashlist(HashList, 1, 0, 0)
    end.
|
||||
|
||||
|
||||
-spec check_hash({integer(), integer()}, binary()) -> boolean().
%% @doc
%% Check for the presence of a given hash within a bloom.
%%
%% The hash is a {SegHash, Hash} pair, of which only Hash is consulted.  An
%% empty bloom never contains anything.  Otherwise the number of slots is
%% derived from the size of the binary, the slot for this hash is selected,
%% and the result is true only if every bit of the hash's mask is set in
%% that slot.
check_hash(_Hash, <<>>) ->
    false;
check_hash({_SegHash, Hash}, BloomBin) ->
    % Slot mask is (number of slots - 1)
    SlotSplit = (byte_size(BloomBin) div ?BLOOM_SIZE_BYTES) - 1,
    {Slot, Hashes} = split_hash(Hash, SlotSplit),
    Mask = get_mask(Hashes),
    % Skip the preceding slots, then read this slot as one large integer
    Pos = Slot * ?BLOOM_SIZE_BYTES,
    IntSize = ?INTEGER_SIZE,
    <<_H:Pos/binary, CheckInt:IntSize/integer, _T/binary>> = BloomBin,
    (CheckInt band Mask) =:= Mask.
|
||||
|
||||
%%%============================================================================
|
||||
%%% Internal Functions
|
||||
%%%============================================================================
|
||||
|
||||
%% Break a hash into the slot it falls in and the bit positions to set
%% within that slot.  The slot comes from the low bits of the hash (masked
%% by SlotSplit); the two bit positions are taken from the hash shifted right
%% by 4 and by 18 bits respectively, each masked with ?BAND_MASK.
split_hash(Hash, SlotSplit) ->
    BitPosA = (Hash bsr 4) band (?BAND_MASK),
    BitPosB = (Hash bsr 18) band (?BAND_MASK),
    {Hash band SlotSplit, [BitPosA, BitPosB]}.
|
||||
|
||||
%% Turn a pair of bit positions into an integer mask with exactly those bits
%% set (a single bit if both positions are the same).
get_mask([PosA, PosB]) ->
    BitA = 1 bsl PosA,
    BitB = 1 bsl PosB,
    BitA bor BitB.
|
||||
|
||||
|
||||
%% This looks ugly and clunky, but in tests it was quicker than modifying an
|
||||
%% Erlang term like an array as it is passed around the loop
|
||||
|
||||
%% Two-slot accumulator: folds every hash in the list into one of two large
%% integers and, once the list is exhausted, serialises them as consecutive
%% ?INTEGER_SIZE-bit integers.
add_hashlist([], _SlotSplit, Acc0, Acc1) ->
    <<Acc0:?INTEGER_SIZE/integer, Acc1:?INTEGER_SIZE/integer>>;
add_hashlist([{_SegHash, TopHash}|Rest], SlotSplit, Acc0, Acc1) ->
    case split_hash(TopHash, SlotSplit) of
        {0, Positions} ->
            add_hashlist(Rest, SlotSplit,
                            Acc0 bor get_mask(Positions), Acc1);
        {1, Positions} ->
            add_hashlist(Rest, SlotSplit,
                            Acc0, Acc1 bor get_mask(Positions))
    end.
|
||||
|
||||
%% Four-slot accumulator: folds every hash in the list into one of four large
%% integers and, once the list is exhausted, serialises them as consecutive
%% ?INTEGER_SIZE-bit integers.
add_hashlist([], _SlotSplit, Acc0, Acc1, Acc2, Acc3) ->
    <<Acc0:?INTEGER_SIZE/integer, Acc1:?INTEGER_SIZE/integer,
        Acc2:?INTEGER_SIZE/integer, Acc3:?INTEGER_SIZE/integer>>;
add_hashlist([{_SegHash, TopHash}|Rest], SlotSplit, Acc0, Acc1, Acc2, Acc3) ->
    case split_hash(TopHash, SlotSplit) of
        {0, Positions} ->
            add_hashlist(Rest, SlotSplit,
                            Acc0 bor get_mask(Positions), Acc1, Acc2, Acc3);
        {1, Positions} ->
            add_hashlist(Rest, SlotSplit,
                            Acc0, Acc1 bor get_mask(Positions), Acc2, Acc3);
        {2, Positions} ->
            add_hashlist(Rest, SlotSplit,
                            Acc0, Acc1, Acc2 bor get_mask(Positions), Acc3);
        {3, Positions} ->
            add_hashlist(Rest, SlotSplit,
                            Acc0, Acc1, Acc2, Acc3 bor get_mask(Positions))
    end.
|
||||
|
||||
%% Eight-slot accumulator: folds every hash in the list into one of eight
%% large integers and, once the list is exhausted, serialises them as
%% consecutive ?INTEGER_SIZE-bit integers.
add_hashlist([], _SlotSplit, A0, A1, A2, A3, A4, A5, A6, A7) ->
    <<A0:?INTEGER_SIZE/integer, A1:?INTEGER_SIZE/integer,
        A2:?INTEGER_SIZE/integer, A3:?INTEGER_SIZE/integer,
        A4:?INTEGER_SIZE/integer, A5:?INTEGER_SIZE/integer,
        A6:?INTEGER_SIZE/integer, A7:?INTEGER_SIZE/integer>>;
add_hashlist([{_SegHash, TopHash}|Rest],
                SlotSplit,
                A0, A1, A2, A3, A4, A5, A6, A7) ->
    case split_hash(TopHash, SlotSplit) of
        {0, Positions} ->
            M = get_mask(Positions),
            add_hashlist(Rest, SlotSplit,
                            A0 bor M, A1, A2, A3, A4, A5, A6, A7);
        {1, Positions} ->
            M = get_mask(Positions),
            add_hashlist(Rest, SlotSplit,
                            A0, A1 bor M, A2, A3, A4, A5, A6, A7);
        {2, Positions} ->
            M = get_mask(Positions),
            add_hashlist(Rest, SlotSplit,
                            A0, A1, A2 bor M, A3, A4, A5, A6, A7);
        {3, Positions} ->
            M = get_mask(Positions),
            add_hashlist(Rest, SlotSplit,
                            A0, A1, A2, A3 bor M, A4, A5, A6, A7);
        {4, Positions} ->
            M = get_mask(Positions),
            add_hashlist(Rest, SlotSplit,
                            A0, A1, A2, A3, A4 bor M, A5, A6, A7);
        {5, Positions} ->
            M = get_mask(Positions),
            add_hashlist(Rest, SlotSplit,
                            A0, A1, A2, A3, A4, A5 bor M, A6, A7);
        {6, Positions} ->
            M = get_mask(Positions),
            add_hashlist(Rest, SlotSplit,
                            A0, A1, A2, A3, A4, A5, A6 bor M, A7);
        {7, Positions} ->
            M = get_mask(Positions),
            add_hashlist(Rest, SlotSplit,
                            A0, A1, A2, A3, A4, A5, A6, A7 bor M)
    end.
|
||||
|
||||
|
||||
%%%============================================================================
|
||||
%%% Test
|
||||
%%%============================================================================
|
||||
|
||||
-ifdef(TEST).
|
||||
|
||||
%% Test helper: generate Count key/value pairs starting at sequence number
%% Seqn, beginning with an empty accumulator.
generate_orderedkeys(Seqn, Count, LowBucket, HighBucket) ->
    generate_orderedkeys(Seqn, Count, [], LowBucket, HighBucket).
|
||||
|
||||
%% Test helper: build one ledger key/value pair per sequence number until
%% Count reaches zero.  Each new pair is prepended to the accumulator, so the
%% pair for the highest sequence number ends up at the head of the result.
generate_orderedkeys(_Seqn, 0, Acc, _BucketLow, _BucketHigh) ->
    Acc;
generate_orderedkeys(Seqn, Count, Acc, BucketLow, BucketHigh) ->
    % Bucket name: "Bucket" followed by a zero-padded 4 character number
    BucketId = BucketLow + (Seqn div (BucketHigh - BucketLow)),
    Bucket = "Bucket" ++ string:right(integer_to_list(BucketId), 4, $0),
    % Key name: "Key" followed by a zero-padded 8 character number, with a
    % random offset added to the sequence number
    KeyId = Seqn * 100 + leveled_rand:uniform(100),
    Key = "Key" ++ string:right(integer_to_list(KeyId), 8, $0),
    LedgerKey = leveled_codec:to_ledgerkey(Bucket, Key, o),
    RandomChunk = leveled_rand:rand_bytes(16),
    {_B, _K, Value, _H, _LMs} =
        leveled_codec:generate_ledgerkv(LedgerKey,
                                        Seqn,
                                        RandomChunk,
                                        64,
                                        infinity),
    generate_orderedkeys(Seqn + 1,
                            Count - 1,
                            [{LedgerKey, Value}|Acc],
                            BucketLow,
                            BucketHigh).
|
||||
|
||||
|
||||
%% Test helper: generate N key/value pairs and return the segment hash of
%% each key, in the same order as the generated pairs.
get_hashlist(N) ->
    lists:map(fun({Key, _Val}) -> leveled_codec:segment_hash(Key) end,
                generate_orderedkeys(1, N, 1, 20)).
|
||||
|
||||
%% Test helper: assert that every hash in HashList is reported as present in
%% the bloom.
check_all_hashes(BloomBin, HashList) ->
    lists:foreach(fun(Hash) ->
                        ?assertMatch(true, check_hash(Hash, BloomBin))
                    end,
                    HashList).
|
||||
|
||||
%% Test helper: check each hash against the bloom and tally the outcomes on
%% top of the supplied {TrueCount, FalseCount} counters.
check_neg_hashes(_BloomBin, [], Counters) ->
    Counters;
check_neg_hashes(BloomBin, [Hash|Rest], {Hits, Misses}) ->
    case check_hash(Hash, BloomBin) of
        true ->
            check_neg_hashes(BloomBin, Rest, {Hits + 1, Misses});
        false ->
            check_neg_hashes(BloomBin, Rest, {Hits, Misses + 1})
    end.
|
||||
|
||||
|
||||
%% A bloom built from no hashes must report every lookup as absent.
empty_bloom_test() ->
    EmptyBloom = create_bloom([]),
    Tally = check_neg_hashes(EmptyBloom, [0, 10, 100, 100000], {0, 0}),
    ?assertMatch({0, 4}, Tally).
|
||||
|
||||
%% EUnit generator: run the range tests with a 60 second timeout.
bloom_test_() ->
    {timeout, 60, fun() -> bloom_test_ranges() end}.
|
||||
|
||||
%% Run test_bloom/2 over a series of {Size, Runs} combinations, in order.
bloom_test_ranges() ->
    Cases = [{40000, 2},
                {?INTEGER_SIZE, 10},
                {20000, 2},
                {10000, 2},
                {5000, 2}],
    lists:foreach(fun({Size, Runs}) -> test_bloom(Size, Runs) end, Cases).
|
||||
|
||||
%% Test helper: for each of Runs runs, generate N * 2 hashes and randomly
%% split them into a "hit" list (used to build the bloom) and a "miss" list
%% (never added).  Then:
%%   - time building one bloom per run from the hit list
%%   - time checking, and assert, that every hit is found in its bloom
%%   - time checking the misses, counting false positives
%% and print the timings (in microseconds) and the false positive rate.
test_bloom(N, Runs) ->
    ListOfHashLists =
        lists:map(fun(_X) -> get_hashlist(N * 2) end, lists:seq(1, Runs)),
    SplitListFun =
        fun(HashList) ->
            HitOrMissFun =
                fun(Entry, {HitL, MissL}) ->
                    % Roughly even split; leveled_rand is used rather than
                    % the deprecated random module, matching the rest of
                    % this file.
                    case leveled_rand:uniform(2) =:= 1 of
                        true ->
                            {[Entry|HitL], MissL};
                        false ->
                            {HitL, [Entry|MissL]}
                    end
                end,
            lists:foldl(HitOrMissFun, {[], []}, HashList)
        end,
    SplitListOfHashLists = lists:map(SplitListFun, ListOfHashLists),

    SWa = os:timestamp(),
    ListOfBlooms =
        lists:map(fun({HL, _ML}) -> create_bloom(HL) end,
                    SplitListOfHashLists),
    TSa = timer:now_diff(os:timestamp(), SWa),

    % Pair each {Hits, Misses} split with the bloom built from its hits, so
    % the checks below do not need positional lookups.
    SplitsWithBlooms = lists:zip(SplitListOfHashLists, ListOfBlooms),

    SWb = os:timestamp(),
    lists:foreach(fun({{HL, _ML}, BB}) -> check_all_hashes(BB, HL) end,
                    SplitsWithBlooms),
    TSb = timer:now_diff(os:timestamp(), SWb),

    SWc = os:timestamp(),
    {Pos, Neg} =
        lists:foldl(fun({{_HL, ML}, BB}, Acc) ->
                            check_neg_hashes(BB, ML, Acc)
                        end,
                        {0, 0},
                        SplitsWithBlooms),
    % NOTE(review): this division fails if no miss entries were generated at
    % all (Pos + Neg = 0); not expected for the sizes used by the callers.
    FPR = Pos / (Pos + Neg),
    TSc = timer:now_diff(os:timestamp(), SWc),

    io:format(user,
                "Test with size ~w has microsecond timings: -"
                    ++ " build ~w check ~w neg_check ~w and fpr ~w~n",
                [N, TSa, TSb, TSc, FPR]).
|
||||
|
||||
|
||||
-endif.
|
|
@ -226,7 +226,7 @@ bloom_test_() ->
|
|||
{timeout, 20, fun bloom_test_ranges/0}.
|
||||
|
||||
bloom_test_ranges() ->
|
||||
test_bloom(128, 2000),
|
||||
test_bloom(128, 256),
|
||||
test_bloom(64, 100),
|
||||
test_bloom(32, 100),
|
||||
test_bloom(16, 100),
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue