Use lower fpr tinyblooms

... but maybe they're slower?
This commit is contained in:
Martin Sumner 2017-10-24 15:15:15 +01:00
parent 26aa573ce1
commit 3fd5260cd9
2 changed files with 44 additions and 96 deletions

View file

@ -83,12 +83,12 @@
-spec segment_hash(any()) -> {integer(), integer()}. -spec segment_hash(any()) -> {integer(), integer()}.
%% @doc %% @doc
%% Return two 16 bit integers - the segment ID and a second integer for spare %% Return two integers - the segment ID and a second integer for spare
%% entropy. The hash should be used in blooms or indexes such that some %% entropy. The hash should be used in blooms or indexes such that some
%% speed can be gained if just the segment ID is known - but more can be %% speed can be gained if just the segment ID is known - but more can be
%% gained should the extended hash (with the second element) be known %% gained should the extended hash (with the second element) be known
segment_hash(Key) when is_binary(Key) -> segment_hash(Key) when is_binary(Key) ->
<<SegmentID:16/integer, ExtraHash:16/integer, _Rest/binary>> = <<SegmentID:16/integer, ExtraHash:32/integer, _Rest/binary>> =
crypto:hash(md5, Key), crypto:hash(md5, Key),
{SegmentID, ExtraHash}; {SegmentID, ExtraHash};
segment_hash(Key) -> segment_hash(Key) ->

View file

@ -16,9 +16,12 @@
check_hash/2 check_hash/2
]). ]).
-define(BITS_PER_KEY, 8). % Must be 8 or 4 -define(BLOOM_SIZE_BITS, 128).
-define(INTEGER_SIZE, ?BITS_PER_KEY * 8). % Size of each bloom in bits
-define(BAND_MASK, ?INTEGER_SIZE - 1). % If hash space is now split into 8 different blooms of this size there
% will be 8 bits per key.
-define(BLOOM_SIZE_BYTES, 16). % Bits divided by 8
-define(BAND_MASK, ?BLOOM_SIZE_BITS - 1).
%%%============================================================================ %%%============================================================================
@ -34,9 +37,8 @@ create_bloom(HashList) ->
<<>>; <<>>;
L when L > 32 -> L when L > 32 ->
add_hashlist(HashList, add_hashlist(HashList,
15, 7,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
0, 0, 0, 0, 0, 0);
L when L > 16 -> L when L > 16 ->
add_hashlist(HashList, 3, 0, 0, 0, 0); add_hashlist(HashList, 3, 0, 0, 0, 0);
_ -> _ ->
@ -49,11 +51,12 @@ create_bloom(HashList) ->
check_hash(_Hash, <<>>) -> check_hash(_Hash, <<>>) ->
false; false;
check_hash({_SegHash, Hash}, BloomBin) -> check_hash({_SegHash, Hash}, BloomBin) ->
SlotSplit = (byte_size(BloomBin) div ?BITS_PER_KEY) - 1, SlotSplit = (byte_size(BloomBin) div ?BLOOM_SIZE_BYTES) - 1,
{Slot, H0, H1} = split_hash(Hash, SlotSplit), {Slot, H0, H1, H2, H3} = split_hash(Hash, SlotSplit),
Mask = get_mask(H0, H1), Mask = get_mask(H0, H1, H2, H3),
Pos = Slot * ?BITS_PER_KEY, IntSize = ?BLOOM_SIZE_BITS,
IntSize = ?INTEGER_SIZE, Pos = Slot * ?BLOOM_SIZE_BYTES,
io:format("Pos ~w SlotSplit ~w BloomSize ~w~n", [Pos, SlotSplit, byte_size(BloomBin)]),
<<_H:Pos/binary, CheckInt:IntSize/integer, _T/binary>> = BloomBin, <<_H:Pos/binary, CheckInt:IntSize/integer, _T/binary>> = BloomBin,
case CheckInt band Mask of case CheckInt band Mask of
Mask -> Mask ->
@ -69,27 +72,26 @@ check_hash({_SegHash, Hash}, BloomBin) ->
split_hash(Hash, SlotSplit) -> split_hash(Hash, SlotSplit) ->
Slot = Hash band SlotSplit, Slot = Hash band SlotSplit,
H0 = (Hash bsr 4) band (?BAND_MASK), H0 = (Hash bsr 4) band (?BAND_MASK),
H1 = (Hash bsr 10) band (?BAND_MASK), H1 = (Hash bsr 11) band (?BAND_MASK),
{Slot, H0, H1}. H2 = (Hash bsr 18) band (?BAND_MASK),
H3 = (Hash bsr 25) band (?BAND_MASK),
{Slot, H0, H1, H2, H3}.
get_mask(H0, H1) -> get_mask(H0, H1, H2, H3) ->
case H0 == H1 of lists:foldl(fun(H, Acc) -> Acc + (1 bsl H) end,
true -> 0,
1 bsl H0; lists:usort([H0, H1, H2, H3])).
false ->
(1 bsl H0) + (1 bsl H1)
end.
%% This looks ugly and clunky, but in tests it was quicker than modifying an %% This looks ugly and clunky, but in tests it was quicker than modifying an
%% Erlang term like an array as it is passed around the loop %% Erlang term like an array as it is passed around the loop
add_hashlist([], _S, S0, S1) -> add_hashlist([], _S, S0, S1) ->
IntSize = ?INTEGER_SIZE, IntSize = ?BLOOM_SIZE_BITS,
<<S0:IntSize/integer, S1:IntSize/integer>>; <<S0:IntSize/integer, S1:IntSize/integer>>;
add_hashlist([{_SegHash, TopHash}|T], SlotSplit, S0, S1) -> add_hashlist([{_SegHash, TopHash}|T], SlotSplit, S0, S1) ->
{Slot, H0, H1} = split_hash(TopHash, SlotSplit), {Slot, H0, H1, H2, H3} = split_hash(TopHash, SlotSplit),
Mask = get_mask(H0, H1), Mask = get_mask(H0, H1, H2, H3),
case Slot of case Slot of
0 -> 0 ->
add_hashlist(T, SlotSplit, S0 bor Mask, S1); add_hashlist(T, SlotSplit, S0 bor Mask, S1);
@ -98,12 +100,12 @@ add_hashlist([{_SegHash, TopHash}|T], SlotSplit, S0, S1) ->
end. end.
add_hashlist([], _S, S0, S1, S2, S3) -> add_hashlist([], _S, S0, S1, S2, S3) ->
IntSize = ?INTEGER_SIZE, IntSize = ?BLOOM_SIZE_BITS,
<<S0:IntSize/integer, S1:IntSize/integer, <<S0:IntSize/integer, S1:IntSize/integer,
S2:IntSize/integer, S3:IntSize/integer>>; S2:IntSize/integer, S3:IntSize/integer>>;
add_hashlist([{_SegHash, TopHash}|T], SlotSplit, S0, S1, S2, S3) -> add_hashlist([{_SegHash, TopHash}|T], SlotSplit, S0, S1, S2, S3) ->
{Slot, H0, H1} = split_hash(TopHash, SlotSplit), {Slot, H0, H1, H2, H3} = split_hash(TopHash, SlotSplit),
Mask = get_mask(H0, H1), Mask = get_mask(H0, H1, H2, H3),
case Slot of case Slot of
0 -> 0 ->
add_hashlist(T, SlotSplit, S0 bor Mask, S1, S2, S3); add_hashlist(T, SlotSplit, S0 bor Mask, S1, S2, S3);
@ -115,104 +117,50 @@ add_hashlist([{_SegHash, TopHash}|T], SlotSplit, S0, S1, S2, S3) ->
add_hashlist(T, SlotSplit, S0, S1, S2, S3 bor Mask) add_hashlist(T, SlotSplit, S0, S1, S2, S3 bor Mask)
end. end.
add_hashlist([], _S, S0, S1, S2, S3, S4, S5, S6, S7, S8, S9, add_hashlist([], _S, S0, S1, S2, S3, S4, S5, S6, S7) ->
SA, SB, SC, SD, SE, SF) -> IntSize = ?BLOOM_SIZE_BITS,
IntSize = ?INTEGER_SIZE,
<<S0:IntSize/integer, S1:IntSize/integer, <<S0:IntSize/integer, S1:IntSize/integer,
S2:IntSize/integer, S3:IntSize/integer, S2:IntSize/integer, S3:IntSize/integer,
S4:IntSize/integer, S5:IntSize/integer, S4:IntSize/integer, S5:IntSize/integer,
S6:IntSize/integer, S7:IntSize/integer, S6:IntSize/integer, S7:IntSize/integer>>;
S8:IntSize/integer, S9:IntSize/integer,
SA:IntSize/integer, SB:IntSize/integer,
SC:IntSize/integer, SD:IntSize/integer,
SE:IntSize/integer, SF:IntSize/integer>>;
add_hashlist([{_SegHash, TopHash}|T], add_hashlist([{_SegHash, TopHash}|T],
SlotSplit, SlotSplit,
S0, S1, S2, S3, S4, S5, S6, S7, S8, S9, S0, S1, S2, S3, S4, S5, S6, S7) ->
SA, SB, SC, SD, SE, SF) -> {Slot, H0, H1, H2, H3} = split_hash(TopHash, SlotSplit),
{Slot, H0, H1} = split_hash(TopHash, SlotSplit), Mask = get_mask(H0, H1, H2, H3),
Mask = get_mask(H0, H1),
case Slot of case Slot of
0 -> 0 ->
add_hashlist(T, add_hashlist(T,
SlotSplit, SlotSplit,
S0 bor Mask, S1, S2, S3, S4, S5, S6, S7, S8, S9, S0 bor Mask, S1, S2, S3, S4, S5, S6, S7);
SA, SB, SC, SD, SE, SF);
1 -> 1 ->
add_hashlist(T, add_hashlist(T,
SlotSplit, SlotSplit,
S0, S1 bor Mask, S2, S3, S4, S5, S6, S7, S8, S9, S0, S1 bor Mask, S2, S3, S4, S5, S6, S7);
SA, SB, SC, SD, SE, SF);
2 -> 2 ->
add_hashlist(T, add_hashlist(T,
SlotSplit, SlotSplit,
S0, S1, S2 bor Mask, S3, S4, S5, S6, S7, S8, S9, S0, S1, S2 bor Mask, S3, S4, S5, S6, S7);
SA, SB, SC, SD, SE, SF);
3 -> 3 ->
add_hashlist(T, add_hashlist(T,
SlotSplit, SlotSplit,
S0, S1, S2, S3 bor Mask, S4, S5, S6, S7, S8, S9, S0, S1, S2, S3 bor Mask, S4, S5, S6, S7);
SA, SB, SC, SD, SE, SF);
4 -> 4 ->
add_hashlist(T, add_hashlist(T,
SlotSplit, SlotSplit,
S0, S1, S2, S3, S4 bor Mask, S5, S6, S7, S8, S9, S0, S1, S2, S3, S4 bor Mask, S5, S6, S7);
SA, SB, SC, SD, SE, SF);
5 -> 5 ->
add_hashlist(T, add_hashlist(T,
SlotSplit, SlotSplit,
S0, S1, S2, S3, S4, S5 bor Mask, S6, S7, S8, S9, S0, S1, S2, S3, S4, S5 bor Mask, S6, S7);
SA, SB, SC, SD, SE, SF);
6 -> 6 ->
add_hashlist(T, add_hashlist(T,
SlotSplit, SlotSplit,
S0, S1, S2, S3, S4, S5, S6 bor Mask, S7, S8, S9, S0, S1, S2, S3, S4, S5, S6 bor Mask, S7);
SA, SB, SC, SD, SE, SF);
7 -> 7 ->
add_hashlist(T, add_hashlist(T,
SlotSplit, SlotSplit,
S0, S1, S2, S3, S4, S5, S6, S7 bor Mask, S8, S9, S0, S1, S2, S3, S4, S5, S6, S7 bor Mask)
SA, SB, SC, SD, SE, SF);
8 ->
add_hashlist(T,
SlotSplit,
S0, S1, S2, S3, S4, S5, S6, S7, S8 bor Mask, S9,
SA, SB, SC, SD, SE, SF);
9 ->
add_hashlist(T,
SlotSplit,
S0, S1, S2, S3, S4, S5, S6, S7, S8, S9 bor Mask,
SA, SB, SC, SD, SE, SF);
10 ->
add_hashlist(T,
SlotSplit,
S0, S1, S2, S3, S4, S5, S6, S7, S8, S9,
SA bor Mask, SB, SC, SD, SE, SF);
11 ->
add_hashlist(T,
SlotSplit,
S0, S1, S2, S3, S4, S5, S6, S7, S8, S9,
SA, SB bor Mask, SC, SD, SE, SF);
12 ->
add_hashlist(T,
SlotSplit,
S0, S1, S2, S3, S4, S5, S6, S7, S8, S9,
SA, SB, SC bor Mask, SD, SE, SF);
13 ->
add_hashlist(T,
SlotSplit,
S0, S1, S2, S3, S4, S5, S6, S7, S8, S9,
SA, SB, SC, SD bor Mask, SE, SF);
14 ->
add_hashlist(T,
SlotSplit,
S0, S1, S2, S3, S4, S5, S6, S7, S8, S9,
SA, SB, SC, SD, SE bor Mask, SF);
15 ->
add_hashlist(T,
SlotSplit,
S0, S1, S2, S3, S4, S5, S6, S7, S8, S9,
SA, SB, SC, SD, SE, SF bor Mask)
end. end.