leveled/src/leveled_tinybloom.erl
martinsumner e66e326c2b Dialyzer help
Add specs and docs
2017-05-30 16:50:23 +01:00

340 lines
No EOL
12 KiB
Erlang

%% -------- TinyBloom ---------
%%
%% A fixed size bloom that supports 128 keys only, made to try and minimise
%% the cost of producing the bloom
%%
-module(leveled_tinybloom).
-include("include/leveled.hrl").
-include_lib("eunit/include/eunit.hrl").
-export([
create_bloom/1,
check_hash/2
]).
-define(BITS_PER_KEY, 8). % Must be 8 or 4
-define(INTEGER_SIZE, ?BITS_PER_KEY * 8).
-define(BAND_MASK, ?INTEGER_SIZE - 1).
%%%============================================================================
%%% API
%%%============================================================================
-spec create_bloom(list(integer())) -> binary().
%% @doc
%% Create a binary bloom filter from alist of hashes
create_bloom(HashList) ->
case length(HashList) of
0 ->
<<>>;
L when L > 32 ->
add_hashlist(HashList,
15,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0);
L when L > 16 ->
add_hashlist(HashList, 3, 0, 0, 0, 0);
_ ->
add_hashlist(HashList, 1, 0, 0)
end.
-spec check_hash(integer(), binary()) -> boolean().
%% @doc
%% Check for the presence of a given hash within a bloom
check_hash(_Hash, <<>>) ->
false;
check_hash(Hash, BloomBin) ->
SlotSplit = (byte_size(BloomBin) div ?BITS_PER_KEY) - 1,
{Slot, H0, H1} = split_hash(Hash, SlotSplit),
Mask = get_mask(H0, H1),
Pos = Slot * ?BITS_PER_KEY,
IntSize = ?INTEGER_SIZE,
<<_H:Pos/binary, CheckInt:IntSize/integer, _T/binary>> = BloomBin,
case CheckInt band Mask of
Mask ->
true;
_ ->
false
end.
%%%============================================================================
%%% Internal Functions
%%%============================================================================
split_hash(Hash, SlotSplit) ->
Slot = Hash band SlotSplit,
H0 = (Hash bsr 4) band (?BAND_MASK),
H1 = (Hash bsr 10) band (?BAND_MASK),
H3 = (Hash bsr 16) band (?BAND_MASK),
H4 = (Hash bsr 22) band (?BAND_MASK),
Slot0 = (Hash bsr 28) band SlotSplit,
{Slot bxor Slot0, H0 bxor H3, H1 bxor H4}.
get_mask(H0, H1) ->
case H0 == H1 of
true ->
1 bsl H0;
false ->
(1 bsl H0) + (1 bsl H1)
end.
%% This looks ugly and clunky, but in tests it was quicker than modifying an
%% Erlang term like an array as it is passed around the loop
add_hashlist([], _S, S0, S1) ->
IntSize = ?INTEGER_SIZE,
<<S0:IntSize/integer, S1:IntSize/integer>>;
add_hashlist([TopHash|T], SlotSplit, S0, S1) ->
{Slot, H0, H1} = split_hash(TopHash, SlotSplit),
Mask = get_mask(H0, H1),
case Slot of
0 ->
add_hashlist(T, SlotSplit, S0 bor Mask, S1);
1 ->
add_hashlist(T, SlotSplit, S0, S1 bor Mask)
end.
add_hashlist([], _S, S0, S1, S2, S3) ->
IntSize = ?INTEGER_SIZE,
<<S0:IntSize/integer, S1:IntSize/integer,
S2:IntSize/integer, S3:IntSize/integer>>;
add_hashlist([TopHash|T], SlotSplit, S0, S1, S2, S3) ->
{Slot, H0, H1} = split_hash(TopHash, SlotSplit),
Mask = get_mask(H0, H1),
case Slot of
0 ->
add_hashlist(T, SlotSplit, S0 bor Mask, S1, S2, S3);
1 ->
add_hashlist(T, SlotSplit, S0, S1 bor Mask, S2, S3);
2 ->
add_hashlist(T, SlotSplit, S0, S1, S2 bor Mask, S3);
3 ->
add_hashlist(T, SlotSplit, S0, S1, S2, S3 bor Mask)
end.
add_hashlist([], _S, S0, S1, S2, S3, S4, S5, S6, S7, S8, S9,
SA, SB, SC, SD, SE, SF) ->
IntSize = ?INTEGER_SIZE,
<<S0:IntSize/integer, S1:IntSize/integer,
S2:IntSize/integer, S3:IntSize/integer,
S4:IntSize/integer, S5:IntSize/integer,
S6:IntSize/integer, S7:IntSize/integer,
S8:IntSize/integer, S9:IntSize/integer,
SA:IntSize/integer, SB:IntSize/integer,
SC:IntSize/integer, SD:IntSize/integer,
SE:IntSize/integer, SF:IntSize/integer>>;
add_hashlist([TopHash|T],
SlotSplit,
S0, S1, S2, S3, S4, S5, S6, S7, S8, S9,
SA, SB, SC, SD, SE, SF) ->
{Slot, H0, H1} = split_hash(TopHash, SlotSplit),
Mask = get_mask(H0, H1),
case Slot of
0 ->
add_hashlist(T,
SlotSplit,
S0 bor Mask, S1, S2, S3, S4, S5, S6, S7, S8, S9,
SA, SB, SC, SD, SE, SF);
1 ->
add_hashlist(T,
SlotSplit,
S0, S1 bor Mask, S2, S3, S4, S5, S6, S7, S8, S9,
SA, SB, SC, SD, SE, SF);
2 ->
add_hashlist(T,
SlotSplit,
S0, S1, S2 bor Mask, S3, S4, S5, S6, S7, S8, S9,
SA, SB, SC, SD, SE, SF);
3 ->
add_hashlist(T,
SlotSplit,
S0, S1, S2, S3 bor Mask, S4, S5, S6, S7, S8, S9,
SA, SB, SC, SD, SE, SF);
4 ->
add_hashlist(T,
SlotSplit,
S0, S1, S2, S3, S4 bor Mask, S5, S6, S7, S8, S9,
SA, SB, SC, SD, SE, SF);
5 ->
add_hashlist(T,
SlotSplit,
S0, S1, S2, S3, S4, S5 bor Mask, S6, S7, S8, S9,
SA, SB, SC, SD, SE, SF);
6 ->
add_hashlist(T,
SlotSplit,
S0, S1, S2, S3, S4, S5, S6 bor Mask, S7, S8, S9,
SA, SB, SC, SD, SE, SF);
7 ->
add_hashlist(T,
SlotSplit,
S0, S1, S2, S3, S4, S5, S6, S7 bor Mask, S8, S9,
SA, SB, SC, SD, SE, SF);
8 ->
add_hashlist(T,
SlotSplit,
S0, S1, S2, S3, S4, S5, S6, S7, S8 bor Mask, S9,
SA, SB, SC, SD, SE, SF);
9 ->
add_hashlist(T,
SlotSplit,
S0, S1, S2, S3, S4, S5, S6, S7, S8, S9 bor Mask,
SA, SB, SC, SD, SE, SF);
10 ->
add_hashlist(T,
SlotSplit,
S0, S1, S2, S3, S4, S5, S6, S7, S8, S9,
SA bor Mask, SB, SC, SD, SE, SF);
11 ->
add_hashlist(T,
SlotSplit,
S0, S1, S2, S3, S4, S5, S6, S7, S8, S9,
SA, SB bor Mask, SC, SD, SE, SF);
12 ->
add_hashlist(T,
SlotSplit,
S0, S1, S2, S3, S4, S5, S6, S7, S8, S9,
SA, SB, SC bor Mask, SD, SE, SF);
13 ->
add_hashlist(T,
SlotSplit,
S0, S1, S2, S3, S4, S5, S6, S7, S8, S9,
SA, SB, SC, SD bor Mask, SE, SF);
14 ->
add_hashlist(T,
SlotSplit,
S0, S1, S2, S3, S4, S5, S6, S7, S8, S9,
SA, SB, SC, SD, SE bor Mask, SF);
15 ->
add_hashlist(T,
SlotSplit,
S0, S1, S2, S3, S4, S5, S6, S7, S8, S9,
SA, SB, SC, SD, SE, SF bor Mask)
end.
%%%============================================================================
%%% Test
%%%============================================================================
-ifdef(TEST).
generate_randomkeys(Seqn, Count, BucketRangeLow, BucketRangeHigh) ->
generate_randomkeys(Seqn,
Count,
[],
BucketRangeLow,
BucketRangeHigh).
generate_randomkeys(_Seqn, 0, Acc, _BucketLow, _BucketHigh) ->
Acc;
generate_randomkeys(Seqn, Count, Acc, BucketLow, BRange) ->
BRand = random:uniform(BRange),
BNumber = string:right(integer_to_list(BucketLow + BRand), 4, $0),
KNumber = string:right(integer_to_list(random:uniform(10000)), 6, $0),
LedgerKey = leveled_codec:to_ledgerkey("Bucket" ++ BNumber,
"Key" ++ KNumber,
o),
{_B, _K, KV, _H} = leveled_codec:generate_ledgerkv(LedgerKey,
Seqn,
crypto:rand_bytes(64),
64,
infinity),
generate_randomkeys(Seqn + 1,
Count - 1,
[KV|Acc],
BucketLow,
BRange).
get_hashlist(N) ->
KVL0 = lists:ukeysort(1, generate_randomkeys(1, N * 2, 1, 20)),
KVL = lists:sublist(KVL0, N),
HashFun =
fun({K, _V}) ->
leveled_codec:magic_hash(K)
end,
lists:map(HashFun, KVL).
check_all_hashes(BloomBin, HashList) ->
CheckFun =
fun(Hash) ->
?assertMatch(true, check_hash(Hash, BloomBin))
end,
lists:foreach(CheckFun, HashList).
check_neg_hashes(BloomBin, HashList, Counters) ->
CheckFun =
fun(Hash, {AccT, AccF}) ->
case check_hash(Hash, BloomBin) of
true ->
{AccT + 1, AccF};
false ->
{AccT, AccF + 1}
end
end,
lists:foldl(CheckFun, Counters, HashList).
empty_bloom_test() ->
BloomBin0 = create_bloom([]),
?assertMatch({0, 4},
check_neg_hashes(BloomBin0, [0, 10, 100, 100000], {0, 0})).
bloom_test() ->
test_bloom(128),
test_bloom(64),
test_bloom(32),
test_bloom(16),
test_bloom(8).
test_bloom(N) ->
HashList1 = get_hashlist(N),
HashList2 = get_hashlist(N),
HashList3 = get_hashlist(N),
HashList4 = get_hashlist(N),
SWa = os:timestamp(),
BloomBin1 = create_bloom(HashList1),
BloomBin2 = create_bloom(HashList2),
BloomBin3 = create_bloom(HashList3),
BloomBin4 = create_bloom(HashList4),
TSa = timer:now_diff(os:timestamp(), SWa),
SWb = os:timestamp(),
check_all_hashes(BloomBin1, HashList1),
check_all_hashes(BloomBin2, HashList2),
check_all_hashes(BloomBin3, HashList3),
check_all_hashes(BloomBin4, HashList4),
TSb = timer:now_diff(os:timestamp(), SWb),
HashPool = get_hashlist(N * 2),
HashListOut1 = lists:sublist(lists:subtract(HashPool, HashList1), N),
HashListOut2 = lists:sublist(lists:subtract(HashPool, HashList2), N),
HashListOut3 = lists:sublist(lists:subtract(HashPool, HashList3), N),
HashListOut4 = lists:sublist(lists:subtract(HashPool, HashList4), N),
SWc = os:timestamp(),
C0 = {0, 0},
C1 = check_neg_hashes(BloomBin1, HashListOut1, C0),
C2 = check_neg_hashes(BloomBin2, HashListOut2, C1),
C3 = check_neg_hashes(BloomBin3, HashListOut3, C2),
C4 = check_neg_hashes(BloomBin4, HashListOut4, C3),
{Pos, Neg} = C4,
FPR = Pos / (Pos + Neg),
TSc = timer:now_diff(os:timestamp(), SWc),
io:format(user,
"Test with size ~w has microsecond timings: -"
++ " build ~w check ~w neg_check ~w and fpr ~w~n",
[N, TSa, TSb, TSc, FPR]).
-endif.