2017-11-24 11:38:58 +00:00
|
|
|
%% -------- TinyBloom ---------
%%
%% A small bloom that supports 32K keys only, made to try and minimise
%% the cost of producing the bloom.  The bloom is built from slots of 1024
%% bytes: 2, 4 or 16 slots are used depending on how many hashes are added.
%%
|
|
|
|
|
|
|
|
|
|
|
|
-module(leveled_ebloom).
|
|
|
|
|
|
|
|
-include("include/leveled.hrl").
|
|
|
|
|
|
|
|
-include_lib("eunit/include/eunit.hrl").
|
|
|
|
|
|
|
|
-export([
|
|
|
|
create_bloom/1,
|
|
|
|
check_hash/2
|
|
|
|
]).
|
|
|
|
|
2017-11-27 14:48:51 +00:00
|
|
|
%% Size in bytes of a single slot within the bloom binary.
-define(BLOOM_SIZE_BYTES, 1024).
%% Size in bits of a single slot (1024 bytes * 8); each slot is read and
%% written as one integer of this width.
-define(INTEGER_SIZE, 8192).
%% Mask used to reduce a shifted hash to a bit position within a slot.  The
%% body is parenthesised so the macro expands safely when placed next to an
%% operator that binds tighter than '-' (such as band).
-define(BAND_MASK, (?INTEGER_SIZE - 1)).
|
|
|
|
|
|
|
|
|
|
|
|
%%%============================================================================
|
|
|
|
%%% API
|
|
|
|
%%%============================================================================
|
|
|
|
|
|
|
|
-spec create_bloom(list({integer(), integer()})) -> binary().
%% @doc
%% Create a binary bloom filter from a list of hashes.
%%
%% Each element of the list is a {SegHash, Hash} tuple; only the second
%% element is used when building the bloom.  An empty list yields an empty
%% binary.  Otherwise the number of slots in the bloom (2, 4 or 16) is chosen
%% from the length of the list, and the second argument handed to
%% add_hashlist is the slot count minus one, used as a mask to pick the slot.
create_bloom([]) ->
    <<>>;
create_bloom(HashList) ->
    case length(HashList) of
        L when L > 8192 ->
            add_hashlist(HashList,
                            15,
                            0, 0, 0, 0, 0, 0, 0, 0,
                            0, 0, 0, 0, 0, 0, 0, 0);
        %% NOTE(review): 4084 may have been intended as 4096 - confirm before
        %% changing, as it alters the size of the blooms that are produced
        L when L > 4084 ->
            add_hashlist(HashList, 3, 0, 0, 0, 0);
        _ ->
            add_hashlist(HashList, 1, 0, 0)
    end.
|
|
|
|
|
|
|
|
|
|
|
|
-spec check_hash({integer(), integer()}, binary()) -> boolean().
%% @doc
%% Check for the presence of a given hash within a bloom.
%%
%% The hash is a {SegHash, Hash} tuple, of which only the second element is
%% used.  An empty bloom never contains anything, and in that case the hash
%% is not inspected at all.  Returns false only when the hash is definitely
%% not in the bloom; true means that every bit derived from the hash is set
%% in the slot the hash maps to.
check_hash(_Hash, <<>>) ->
    false;
check_hash({_SegHash, Hash}, BloomBin) ->
    %% The number of slots is recovered from the size of the binary
    SlotSplit = (byte_size(BloomBin) div ?BLOOM_SIZE_BYTES) - 1,
    {Slot, Hashes} = split_hash(Hash, SlotSplit),
    Mask = get_mask(Hashes),
    %% Skip the slots ahead of the one of interest, then read that slot as a
    %% single integer
    Pos = Slot * ?BLOOM_SIZE_BYTES,
    IntSize = ?INTEGER_SIZE,
    <<_H:Pos/binary, CheckInt:IntSize/integer, _T/binary>> = BloomBin,
    (CheckInt band Mask) =:= Mask.
|
|
|
|
|
|
|
|
%%%============================================================================
|
|
|
|
%%% Internal Functions
|
|
|
|
%%%============================================================================
|
|
|
|
|
|
|
|
%% Break a hash into the slot it belongs to and the two bit positions to be
%% set (or checked) within that slot.  The slot is taken from the low bits of
%% the hash selected by SlotSplit; the bit positions are taken from the hash
%% shifted right by 4 and by 17 bits respectively, each reduced with
%% ?BAND_MASK.
split_hash(Hash, SlotSplit) ->
    PosA = (Hash bsr 4) band (?BAND_MASK),
    PosB = (Hash bsr 17) band (?BAND_MASK),
    {Hash band SlotSplit, [PosA, PosB]}.
|
2017-11-24 11:38:58 +00:00
|
|
|
|
2017-11-27 14:48:51 +00:00
|
|
|
%% Build an integer with the bits at the two given positions set.  If both
%% positions are the same only a single bit is set.
get_mask([PosA, PosB]) ->
    BitA = 1 bsl PosA,
    BitB = 1 bsl PosB,
    BitA bor BitB.
|
2017-11-24 11:38:58 +00:00
|
|
|
|
|
|
|
|
|
|
|
%% This looks ugly and clunky, but in tests it was quicker than modifying an
|
|
|
|
%% Erlang term like an array as it is passed around the loop
|
|
|
|
|
|
|
|
%% Two-slot accumulator.  Each hash is folded into the integer of the slot it
%% maps to; once the list is exhausted the two integers are packed into the
%% bloom binary, slot 0 first.
add_hashlist([], _S, S0, S1) ->
    Width = ?INTEGER_SIZE,
    <<S0:Width/integer, S1:Width/integer>>;
add_hashlist([{_SegHash, TopHash}|Rest], SlotSplit, S0, S1) ->
    {Slot, Positions} = split_hash(TopHash, SlotSplit),
    Bits = get_mask(Positions),
    case Slot of
        0 -> add_hashlist(Rest, SlotSplit, S0 bor Bits, S1);
        1 -> add_hashlist(Rest, SlotSplit, S0, S1 bor Bits)
    end.
|
|
|
|
|
|
|
|
%% Four-slot accumulator.  Each hash is folded into the integer of the slot
%% it maps to; once the list is exhausted the four integers are packed into
%% the bloom binary, slot 0 first.
add_hashlist([], _S, S0, S1, S2, S3) ->
    Width = ?INTEGER_SIZE,
    <<S0:Width/integer, S1:Width/integer,
        S2:Width/integer, S3:Width/integer>>;
add_hashlist([{_SegHash, TopHash}|Rest], SlotSplit, S0, S1, S2, S3) ->
    {Slot, Positions} = split_hash(TopHash, SlotSplit),
    Bits = get_mask(Positions),
    case Slot of
        0 -> add_hashlist(Rest, SlotSplit, S0 bor Bits, S1, S2, S3);
        1 -> add_hashlist(Rest, SlotSplit, S0, S1 bor Bits, S2, S3);
        2 -> add_hashlist(Rest, SlotSplit, S0, S1, S2 bor Bits, S3);
        3 -> add_hashlist(Rest, SlotSplit, S0, S1, S2, S3 bor Bits)
    end.
|
|
|
|
|
2017-11-27 14:48:51 +00:00
|
|
|
%% Sixteen-slot accumulator.  Each hash is folded into the integer of the
%% slot it maps to; once the list is exhausted the sixteen integers are
%% packed into the bloom binary, slot 0 first.  The slot integers are kept as
%% separate arguments and updated through direct tail calls.
add_hashlist([], _S,
                S0, S1, S2, S3, S4, S5, S6, S7,
                S8, S9, S10, S11, S12, S13, S14, S15) ->
    Width = ?INTEGER_SIZE,
    <<S0:Width/integer, S1:Width/integer, S2:Width/integer, S3:Width/integer,
        S4:Width/integer, S5:Width/integer, S6:Width/integer, S7:Width/integer,
        S8:Width/integer, S9:Width/integer,
        S10:Width/integer, S11:Width/integer,
        S12:Width/integer, S13:Width/integer,
        S14:Width/integer, S15:Width/integer>>;
add_hashlist([{_SegHash, TopHash}|Rest], SlotSplit,
                S0, S1, S2, S3, S4, S5, S6, S7,
                S8, S9, S10, S11, S12, S13, S14, S15) ->
    {Slot, Positions} = split_hash(TopHash, SlotSplit),
    Bits = get_mask(Positions),
    case Slot of
        0 ->
            add_hashlist(Rest, SlotSplit,
                            S0 bor Bits, S1, S2, S3, S4, S5, S6, S7,
                            S8, S9, S10, S11, S12, S13, S14, S15);
        1 ->
            add_hashlist(Rest, SlotSplit,
                            S0, S1 bor Bits, S2, S3, S4, S5, S6, S7,
                            S8, S9, S10, S11, S12, S13, S14, S15);
        2 ->
            add_hashlist(Rest, SlotSplit,
                            S0, S1, S2 bor Bits, S3, S4, S5, S6, S7,
                            S8, S9, S10, S11, S12, S13, S14, S15);
        3 ->
            add_hashlist(Rest, SlotSplit,
                            S0, S1, S2, S3 bor Bits, S4, S5, S6, S7,
                            S8, S9, S10, S11, S12, S13, S14, S15);
        4 ->
            add_hashlist(Rest, SlotSplit,
                            S0, S1, S2, S3, S4 bor Bits, S5, S6, S7,
                            S8, S9, S10, S11, S12, S13, S14, S15);
        5 ->
            add_hashlist(Rest, SlotSplit,
                            S0, S1, S2, S3, S4, S5 bor Bits, S6, S7,
                            S8, S9, S10, S11, S12, S13, S14, S15);
        6 ->
            add_hashlist(Rest, SlotSplit,
                            S0, S1, S2, S3, S4, S5, S6 bor Bits, S7,
                            S8, S9, S10, S11, S12, S13, S14, S15);
        7 ->
            add_hashlist(Rest, SlotSplit,
                            S0, S1, S2, S3, S4, S5, S6, S7 bor Bits,
                            S8, S9, S10, S11, S12, S13, S14, S15);
        8 ->
            add_hashlist(Rest, SlotSplit,
                            S0, S1, S2, S3, S4, S5, S6, S7,
                            S8 bor Bits, S9, S10, S11, S12, S13, S14, S15);
        9 ->
            add_hashlist(Rest, SlotSplit,
                            S0, S1, S2, S3, S4, S5, S6, S7,
                            S8, S9 bor Bits, S10, S11, S12, S13, S14, S15);
        10 ->
            add_hashlist(Rest, SlotSplit,
                            S0, S1, S2, S3, S4, S5, S6, S7,
                            S8, S9, S10 bor Bits, S11, S12, S13, S14, S15);
        11 ->
            add_hashlist(Rest, SlotSplit,
                            S0, S1, S2, S3, S4, S5, S6, S7,
                            S8, S9, S10, S11 bor Bits, S12, S13, S14, S15);
        12 ->
            add_hashlist(Rest, SlotSplit,
                            S0, S1, S2, S3, S4, S5, S6, S7,
                            S8, S9, S10, S11, S12 bor Bits, S13, S14, S15);
        13 ->
            add_hashlist(Rest, SlotSplit,
                            S0, S1, S2, S3, S4, S5, S6, S7,
                            S8, S9, S10, S11, S12, S13 bor Bits, S14, S15);
        14 ->
            add_hashlist(Rest, SlotSplit,
                            S0, S1, S2, S3, S4, S5, S6, S7,
                            S8, S9, S10, S11, S12, S13, S14 bor Bits, S15);
        15 ->
            add_hashlist(Rest, SlotSplit,
                            S0, S1, S2, S3, S4, S5, S6, S7,
                            S8, S9, S10, S11, S12, S13, S14, S15 bor Bits)
    end.
|
|
|
|
|
|
|
|
|
|
|
|
%%%============================================================================
|
|
|
|
%%% Test
|
|
|
|
%%%============================================================================
|
|
|
|
|
|
|
|
-ifdef(TEST).
|
|
|
|
|
|
|
|
%% Generate Count ledger key/value pairs starting from sequence number Seqn,
%% by calling the accumulating form with an empty accumulator.
generate_orderedkeys(Seqn, Count, BucketRangeLow, BucketRangeHigh) ->
    EmptyAcc = [],
    generate_orderedkeys(Seqn, Count, EmptyAcc,
                            BucketRangeLow, BucketRangeHigh).
|
|
|
|
|
|
|
|
%% Accumulating form of the key generator.  Each step builds a bucket name
%% (zero-padded to 4 digits) and a key name (zero-padded to 8 digits, with a
%% random component), converts them to a ledger key, and prepends the
%% {LedgerKey, MetaValue} pair to the accumulator until Count reaches zero.
generate_orderedkeys(_Seqn, 0, Acc, _BucketLow, _BucketHigh) ->
    Acc;
generate_orderedkeys(Seqn, Count, Acc, BucketLow, BucketHigh) ->
    BucketNo = BucketLow + (Seqn div (BucketHigh - BucketLow)),
    Bucket = "Bucket" ++ string:right(integer_to_list(BucketNo), 4, $0),
    KeyNo = Seqn * 100 + leveled_rand:uniform(100),
    Key = "Key" ++ string:right(integer_to_list(KeyNo), 8, $0),
    LK = leveled_codec:to_ledgerkey(Bucket, Key, o),
    Chunk = leveled_rand:rand_bytes(16),
    {_B, _K, MV, _H, _LMs} =
        leveled_codec:generate_ledgerkv(LK, Seqn, Chunk, 64, infinity),
    generate_orderedkeys(Seqn + 1, Count - 1, [{LK, MV}|Acc],
                            BucketLow, BucketHigh).
|
|
|
|
|
|
|
|
|
|
|
|
%% Generate N key/value pairs and return the segment hash of each key.
get_hashlist(N) ->
    lists:map(fun({Key, _Value}) -> leveled_codec:segment_hash(Key) end,
                generate_orderedkeys(1, N, 1, 20)).
|
|
|
|
|
|
|
|
%% Assert that every hash in HashList is reported as present in the bloom.
check_all_hashes(BloomBin, HashList) ->
    lists:foreach(fun(Hash) ->
                        ?assertMatch(true, check_hash(Hash, BloomBin))
                    end,
                    HashList).
|
|
|
|
|
|
|
|
%% Check each hash in HashList against the bloom, adding to a running
%% {Present, Absent} pair of counters that starts from Counters.
check_neg_hashes(BloomBin, HashList, Counters) ->
    Tally =
        fun(Hash, {Present, Absent}) ->
            case check_hash(Hash, BloomBin) of
                true -> {Present + 1, Absent};
                false -> {Present, Absent + 1}
            end
        end,
    lists:foldl(Tally, Counters, HashList).
|
|
|
|
|
|
|
|
|
|
|
|
%% A bloom created from no hashes must report every lookup as absent.
empty_bloom_test() ->
    EmptyBloom = create_bloom([]),
    Counts = check_neg_hashes(EmptyBloom, [0, 10, 100, 100000], {0, 0}),
    ?assertMatch({0, 4}, Counts).
|
|
|
|
|
|
|
|
%% Test generator: runs the range tests with a 60 second timeout.
bloom_test_() ->
    {timeout, 60, fun() -> bloom_test_ranges() end}.
|
|
|
|
|
|
|
|
%% Exercise the bloom across a range of sizes, as {Size, Runs} pairs.
bloom_test_ranges() ->
    Cases = [{40000, 2}, {128 * 256, 10}, {20000, 2}, {10000, 2}, {5000, 2}],
    lists:foreach(fun({N, Runs}) -> test_bloom(N, Runs) end, Cases).
|
|
|
|
|
|
|
|
%% Build and check blooms for Runs independently generated hash lists.
%%
%% For each run N * 2 hashes are generated and split at random into a list
%% that is added to the bloom and a list that is not.  Every added hash must
%% be found; the hashes that were not added are used to measure the false
%% positive rate.  The average time per run of each phase is printed.
test_bloom(N, Runs) ->
    ListOfHashLists =
        lists:map(fun(_X) -> get_hashlist(N * 2) end, lists:seq(1, Runs)),
    SplitListFun =
        fun(HashList) ->
            HitOrMissFun =
                fun(Entry, {HitL, MissL}) ->
                    %% leveled_rand is used here as it is for key generation,
                    %% rather than the deprecated random module
                    case leveled_rand:uniform(2) =:= 1 of
                        true ->
                            {[Entry|HitL], MissL};
                        false ->
                            {HitL, [Entry|MissL]}
                    end
                end,
            lists:foldl(HitOrMissFun, {[], []}, HashList)
        end,
    SplitListOfHashLists = lists:map(SplitListFun, ListOfHashLists),

    SWa = os:timestamp(),
    ListOfBlooms =
        lists:map(fun({HL, _ML}) -> create_bloom(HL) end,
                    SplitListOfHashLists),
    TSa = timer:now_diff(os:timestamp(), SWa)/Runs,

    %% Pair each split with its bloom outside of the timed sections, so that
    %% the timings do not include repeated list lookups
    SplitsWithBlooms = lists:zip(SplitListOfHashLists, ListOfBlooms),

    SWb = os:timestamp(),
    lists:foreach(fun({{HL, _ML}, BB}) -> check_all_hashes(BB, HL) end,
                    SplitsWithBlooms),
    TSb = timer:now_diff(os:timestamp(), SWb)/Runs,

    SWc = os:timestamp(),
    {Pos, Neg} =
        lists:foldl(fun({{_HL, ML}, BB}, Acc) ->
                            check_neg_hashes(BB, ML, Acc)
                        end,
                        {0, 0},
                        SplitsWithBlooms),
    FPR = Pos / (Pos + Neg),
    TSc = timer:now_diff(os:timestamp(), SWc)/Runs,

    io:format(user,
                "Test with size ~w has microsecond timings: -"
                    ++ " build ~w check ~w neg_check ~w and fpr ~w~n",
                [N, TSa, TSb, TSc, FPR]).
|
|
|
|
|
|
|
|
|
|
|
|
-endif.
|