Use more keys in bloom
Use 4 keys in the bloom (which is closer to the optimal size). This should halve the false-positive rate (fpr), as we can now use the larger ExtraHash rather than being constrained by the SegmentHash here.
This commit is contained in:
parent
f08faf6432
commit
6af1d3b003
2 changed files with 31 additions and 86 deletions
|
@ -88,7 +88,7 @@
|
||||||
%% speed can be gained if just the segment ID is known - but more can be
|
%% speed can be gained if just the segment ID is known - but more can be
|
||||||
%% gained should the extended hash (with the second element) be known
|
%% gained should the extended hash (with the second element) be known
|
||||||
segment_hash(Key) when is_binary(Key) ->
|
segment_hash(Key) when is_binary(Key) ->
|
||||||
<<SegmentID:16/integer, ExtraHash:16/integer, _Rest/binary>> =
|
<<SegmentID:16/integer, ExtraHash:32/integer, _Rest/binary>> =
|
||||||
crypto:hash(md5, Key),
|
crypto:hash(md5, Key),
|
||||||
{SegmentID, ExtraHash};
|
{SegmentID, ExtraHash};
|
||||||
segment_hash(Key) ->
|
segment_hash(Key) ->
|
||||||
|
|
|
@ -16,8 +16,8 @@
|
||||||
check_hash/2
|
check_hash/2
|
||||||
]).
|
]).
|
||||||
|
|
||||||
-define(BITS_PER_KEY, 8). % Must be 8 or 4
|
-define(BLOOM_SIZE_BYTES, 16).
|
||||||
-define(INTEGER_SIZE, ?BITS_PER_KEY * 8).
|
-define(INTEGER_SIZE, 128).
|
||||||
-define(BAND_MASK, ?INTEGER_SIZE - 1).
|
-define(BAND_MASK, ?INTEGER_SIZE - 1).
|
||||||
|
|
||||||
|
|
||||||
|
@ -34,9 +34,8 @@ create_bloom(HashList) ->
|
||||||
<<>>;
|
<<>>;
|
||||||
L when L > 32 ->
|
L when L > 32 ->
|
||||||
add_hashlist(HashList,
|
add_hashlist(HashList,
|
||||||
15,
|
7,
|
||||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
0, 0, 0, 0, 0, 0, 0, 0);
|
||||||
0, 0, 0, 0, 0, 0);
|
|
||||||
L when L > 16 ->
|
L when L > 16 ->
|
||||||
add_hashlist(HashList, 3, 0, 0, 0, 0);
|
add_hashlist(HashList, 3, 0, 0, 0, 0);
|
||||||
_ ->
|
_ ->
|
||||||
|
@ -49,10 +48,10 @@ create_bloom(HashList) ->
|
||||||
check_hash(_Hash, <<>>) ->
|
check_hash(_Hash, <<>>) ->
|
||||||
false;
|
false;
|
||||||
check_hash({_SegHash, Hash}, BloomBin) ->
|
check_hash({_SegHash, Hash}, BloomBin) ->
|
||||||
SlotSplit = (byte_size(BloomBin) div ?BITS_PER_KEY) - 1,
|
SlotSplit = (byte_size(BloomBin) div ?BLOOM_SIZE_BYTES) - 1,
|
||||||
{Slot, H0, H1, H2, H3} = split_hash(Hash, SlotSplit),
|
{Slot, Hashes} = split_hash(Hash, SlotSplit),
|
||||||
Mask = get_mask(H0, H1, H2, H3),
|
Mask = get_mask(Hashes),
|
||||||
Pos = Slot * ?BITS_PER_KEY,
|
Pos = Slot * ?BLOOM_SIZE_BYTES,
|
||||||
IntSize = ?INTEGER_SIZE,
|
IntSize = ?INTEGER_SIZE,
|
||||||
<<_H:Pos/binary, CheckInt:IntSize/integer, _T/binary>> = BloomBin,
|
<<_H:Pos/binary, CheckInt:IntSize/integer, _T/binary>> = BloomBin,
|
||||||
case CheckInt band Mask of
|
case CheckInt band Mask of
|
||||||
|
@ -69,12 +68,12 @@ check_hash({_SegHash, Hash}, BloomBin) ->
|
||||||
split_hash(Hash, SlotSplit) ->
|
split_hash(Hash, SlotSplit) ->
|
||||||
Slot = Hash band SlotSplit,
|
Slot = Hash band SlotSplit,
|
||||||
H0 = (Hash bsr 4) band (?BAND_MASK),
|
H0 = (Hash bsr 4) band (?BAND_MASK),
|
||||||
H1 = (Hash bsr 10) band (?BAND_MASK),
|
H1 = (Hash bsr 11) band (?BAND_MASK),
|
||||||
H2 = (Hash bsr 16) band (?BAND_MASK),
|
H2 = (Hash bsr 18) band (?BAND_MASK),
|
||||||
H3 = (Hash bsr 24) band (?BAND_MASK),
|
H3 = (Hash bsr 25) band (?BAND_MASK),
|
||||||
{Slot, H0, H1, H2, H3}.
|
{Slot, [H0, H1, H2, H3]}.
|
||||||
|
|
||||||
get_mask(H0, H1, H2, H3) ->
|
get_mask([H0, H1, H2, H3]) ->
|
||||||
(1 bsl H0) bor (1 bsl H1) bor (1 bsl H2) bor (1 bsl H3).
|
(1 bsl H0) bor (1 bsl H1) bor (1 bsl H2) bor (1 bsl H3).
|
||||||
|
|
||||||
|
|
||||||
|
@ -85,8 +84,8 @@ add_hashlist([], _S, S0, S1) ->
|
||||||
IntSize = ?INTEGER_SIZE,
|
IntSize = ?INTEGER_SIZE,
|
||||||
<<S0:IntSize/integer, S1:IntSize/integer>>;
|
<<S0:IntSize/integer, S1:IntSize/integer>>;
|
||||||
add_hashlist([{_SegHash, TopHash}|T], SlotSplit, S0, S1) ->
|
add_hashlist([{_SegHash, TopHash}|T], SlotSplit, S0, S1) ->
|
||||||
{Slot, H0, H1, H2, H3} = split_hash(TopHash, SlotSplit),
|
{Slot, Hashes} = split_hash(TopHash, SlotSplit),
|
||||||
Mask = get_mask(H0, H1, H2, H3),
|
Mask = get_mask(Hashes),
|
||||||
case Slot of
|
case Slot of
|
||||||
0 ->
|
0 ->
|
||||||
add_hashlist(T, SlotSplit, S0 bor Mask, S1);
|
add_hashlist(T, SlotSplit, S0 bor Mask, S1);
|
||||||
|
@ -99,8 +98,8 @@ add_hashlist([], _S, S0, S1, S2, S3) ->
|
||||||
<<S0:IntSize/integer, S1:IntSize/integer,
|
<<S0:IntSize/integer, S1:IntSize/integer,
|
||||||
S2:IntSize/integer, S3:IntSize/integer>>;
|
S2:IntSize/integer, S3:IntSize/integer>>;
|
||||||
add_hashlist([{_SegHash, TopHash}|T], SlotSplit, S0, S1, S2, S3) ->
|
add_hashlist([{_SegHash, TopHash}|T], SlotSplit, S0, S1, S2, S3) ->
|
||||||
{Slot, H0, H1, H2, H3} = split_hash(TopHash, SlotSplit),
|
{Slot, Hashes} = split_hash(TopHash, SlotSplit),
|
||||||
Mask = get_mask(H0, H1, H2, H3),
|
Mask = get_mask(Hashes),
|
||||||
case Slot of
|
case Slot of
|
||||||
0 ->
|
0 ->
|
||||||
add_hashlist(T, SlotSplit, S0 bor Mask, S1, S2, S3);
|
add_hashlist(T, SlotSplit, S0 bor Mask, S1, S2, S3);
|
||||||
|
@ -112,104 +111,50 @@ add_hashlist([{_SegHash, TopHash}|T], SlotSplit, S0, S1, S2, S3) ->
|
||||||
add_hashlist(T, SlotSplit, S0, S1, S2, S3 bor Mask)
|
add_hashlist(T, SlotSplit, S0, S1, S2, S3 bor Mask)
|
||||||
end.
|
end.
|
||||||
|
|
||||||
add_hashlist([], _S, S0, S1, S2, S3, S4, S5, S6, S7, S8, S9,
|
add_hashlist([], _S, S0, S1, S2, S3, S4, S5, S6, S7) ->
|
||||||
SA, SB, SC, SD, SE, SF) ->
|
|
||||||
IntSize = ?INTEGER_SIZE,
|
IntSize = ?INTEGER_SIZE,
|
||||||
<<S0:IntSize/integer, S1:IntSize/integer,
|
<<S0:IntSize/integer, S1:IntSize/integer,
|
||||||
S2:IntSize/integer, S3:IntSize/integer,
|
S2:IntSize/integer, S3:IntSize/integer,
|
||||||
S4:IntSize/integer, S5:IntSize/integer,
|
S4:IntSize/integer, S5:IntSize/integer,
|
||||||
S6:IntSize/integer, S7:IntSize/integer,
|
S6:IntSize/integer, S7:IntSize/integer>>;
|
||||||
S8:IntSize/integer, S9:IntSize/integer,
|
|
||||||
SA:IntSize/integer, SB:IntSize/integer,
|
|
||||||
SC:IntSize/integer, SD:IntSize/integer,
|
|
||||||
SE:IntSize/integer, SF:IntSize/integer>>;
|
|
||||||
add_hashlist([{_SegHash, TopHash}|T],
|
add_hashlist([{_SegHash, TopHash}|T],
|
||||||
SlotSplit,
|
SlotSplit,
|
||||||
S0, S1, S2, S3, S4, S5, S6, S7, S8, S9,
|
S0, S1, S2, S3, S4, S5, S6, S7) ->
|
||||||
SA, SB, SC, SD, SE, SF) ->
|
{Slot, Hashes} = split_hash(TopHash, SlotSplit),
|
||||||
{Slot, H0, H1, H2, H3} = split_hash(TopHash, SlotSplit),
|
Mask = get_mask(Hashes),
|
||||||
Mask = get_mask(H0, H1, H2, H3),
|
|
||||||
case Slot of
|
case Slot of
|
||||||
0 ->
|
0 ->
|
||||||
add_hashlist(T,
|
add_hashlist(T,
|
||||||
SlotSplit,
|
SlotSplit,
|
||||||
S0 bor Mask, S1, S2, S3, S4, S5, S6, S7, S8, S9,
|
S0 bor Mask, S1, S2, S3, S4, S5, S6, S7);
|
||||||
SA, SB, SC, SD, SE, SF);
|
|
||||||
1 ->
|
1 ->
|
||||||
add_hashlist(T,
|
add_hashlist(T,
|
||||||
SlotSplit,
|
SlotSplit,
|
||||||
S0, S1 bor Mask, S2, S3, S4, S5, S6, S7, S8, S9,
|
S0, S1 bor Mask, S2, S3, S4, S5, S6, S7);
|
||||||
SA, SB, SC, SD, SE, SF);
|
|
||||||
2 ->
|
2 ->
|
||||||
add_hashlist(T,
|
add_hashlist(T,
|
||||||
SlotSplit,
|
SlotSplit,
|
||||||
S0, S1, S2 bor Mask, S3, S4, S5, S6, S7, S8, S9,
|
S0, S1, S2 bor Mask, S3, S4, S5, S6, S7);
|
||||||
SA, SB, SC, SD, SE, SF);
|
|
||||||
3 ->
|
3 ->
|
||||||
add_hashlist(T,
|
add_hashlist(T,
|
||||||
SlotSplit,
|
SlotSplit,
|
||||||
S0, S1, S2, S3 bor Mask, S4, S5, S6, S7, S8, S9,
|
S0, S1, S2, S3 bor Mask, S4, S5, S6, S7);
|
||||||
SA, SB, SC, SD, SE, SF);
|
|
||||||
4 ->
|
4 ->
|
||||||
add_hashlist(T,
|
add_hashlist(T,
|
||||||
SlotSplit,
|
SlotSplit,
|
||||||
S0, S1, S2, S3, S4 bor Mask, S5, S6, S7, S8, S9,
|
S0, S1, S2, S3, S4 bor Mask, S5, S6, S7);
|
||||||
SA, SB, SC, SD, SE, SF);
|
|
||||||
5 ->
|
5 ->
|
||||||
add_hashlist(T,
|
add_hashlist(T,
|
||||||
SlotSplit,
|
SlotSplit,
|
||||||
S0, S1, S2, S3, S4, S5 bor Mask, S6, S7, S8, S9,
|
S0, S1, S2, S3, S4, S5 bor Mask, S6, S7);
|
||||||
SA, SB, SC, SD, SE, SF);
|
|
||||||
6 ->
|
6 ->
|
||||||
add_hashlist(T,
|
add_hashlist(T,
|
||||||
SlotSplit,
|
SlotSplit,
|
||||||
S0, S1, S2, S3, S4, S5, S6 bor Mask, S7, S8, S9,
|
S0, S1, S2, S3, S4, S5, S6 bor Mask, S7);
|
||||||
SA, SB, SC, SD, SE, SF);
|
|
||||||
7 ->
|
7 ->
|
||||||
add_hashlist(T,
|
add_hashlist(T,
|
||||||
SlotSplit,
|
SlotSplit,
|
||||||
S0, S1, S2, S3, S4, S5, S6, S7 bor Mask, S8, S9,
|
S0, S1, S2, S3, S4, S5, S6, S7 bor Mask)
|
||||||
SA, SB, SC, SD, SE, SF);
|
|
||||||
8 ->
|
|
||||||
add_hashlist(T,
|
|
||||||
SlotSplit,
|
|
||||||
S0, S1, S2, S3, S4, S5, S6, S7, S8 bor Mask, S9,
|
|
||||||
SA, SB, SC, SD, SE, SF);
|
|
||||||
9 ->
|
|
||||||
add_hashlist(T,
|
|
||||||
SlotSplit,
|
|
||||||
S0, S1, S2, S3, S4, S5, S6, S7, S8, S9 bor Mask,
|
|
||||||
SA, SB, SC, SD, SE, SF);
|
|
||||||
10 ->
|
|
||||||
add_hashlist(T,
|
|
||||||
SlotSplit,
|
|
||||||
S0, S1, S2, S3, S4, S5, S6, S7, S8, S9,
|
|
||||||
SA bor Mask, SB, SC, SD, SE, SF);
|
|
||||||
11 ->
|
|
||||||
add_hashlist(T,
|
|
||||||
SlotSplit,
|
|
||||||
S0, S1, S2, S3, S4, S5, S6, S7, S8, S9,
|
|
||||||
SA, SB bor Mask, SC, SD, SE, SF);
|
|
||||||
12 ->
|
|
||||||
add_hashlist(T,
|
|
||||||
SlotSplit,
|
|
||||||
S0, S1, S2, S3, S4, S5, S6, S7, S8, S9,
|
|
||||||
SA, SB, SC bor Mask, SD, SE, SF);
|
|
||||||
13 ->
|
|
||||||
add_hashlist(T,
|
|
||||||
SlotSplit,
|
|
||||||
S0, S1, S2, S3, S4, S5, S6, S7, S8, S9,
|
|
||||||
SA, SB, SC, SD bor Mask, SE, SF);
|
|
||||||
14 ->
|
|
||||||
add_hashlist(T,
|
|
||||||
SlotSplit,
|
|
||||||
S0, S1, S2, S3, S4, S5, S6, S7, S8, S9,
|
|
||||||
SA, SB, SC, SD, SE bor Mask, SF);
|
|
||||||
15 ->
|
|
||||||
add_hashlist(T,
|
|
||||||
SlotSplit,
|
|
||||||
S0, S1, S2, S3, S4, S5, S6, S7, S8, S9,
|
|
||||||
SA, SB, SC, SD, SE, SF bor Mask)
|
|
||||||
end.
|
end.
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue