283 lines
9.1 KiB
Erlang
283 lines
9.1 KiB
Erlang
%% Used for creating fixed-size self-regulating encoded bloom filters
|
|
%%
|
|
%% Normally a bloom filter in order to achieve optimium size increases the
|
|
%% number of hashes as the desired false positive rate increases. There is
|
|
%% a processing overhead for checking this bloom, both because of the number
|
|
%% of hash calculations required, and also because of the need to CRC check
|
|
%% the bloom to ensure a false negative result is not returned due to
|
|
%% corruption.
|
|
%%
|
|
%% A more space efficient bloom can be achieved through the compression of
|
|
%% bloom filters with less hashes (and in an optimal case a single hash).
|
|
%% This can be achieved using rice encoding.
|
|
%%
|
|
%% Rice-encoding and single hash blooms are used here in order to provide an
|
|
%% optimally space efficient solution, but also as the processing required to
|
|
%% support uncompression can be concurrently performing a checksum role.
|
|
%%
|
|
%% For this to work, the bloom is divided into 64 parts and a 32-bit hash is
|
|
%% required. Each hash is placed into one of 64 blooms based on the six least
|
|
%% significant bits of the hash, and the fmost significant 26-bits are used
|
|
%% to indicate the bit to be added to the bloom.
|
|
%%
|
|
%% The bloom is then created by calculating the differences between the ordered
|
|
%% elements of the hash list and representing the difference using an exponent
|
|
%% and a 13-bit remainder i.e.
|
|
%% 8000 -> 0 11111 01000000
|
|
%% 10000 -> 10 00000 00010000
|
|
%% 20000 -> 110 01110 00100000
|
|
%%
|
|
%% Each bloom should have approximately 64 differences.
|
|
%%
|
|
%% Fronting the bloom is a bloom index, formed first by 16 pairs of 3-byte
|
|
%% max hash, 2-byte length (bits) - with then each of the encoded bitstrings
|
|
%% appended. The max hash is the total of all the differences (which should
|
|
%% be the highest hash in the bloom).
|
|
%%
|
|
%% To check a key against the bloom, hash it, take the four least signifcant
|
|
%% bits and read the start pointer, max hash end pointer from the expected
|
|
%% positions in the bloom index. Then roll through from the start pointer to
|
|
%% the end pointer, accumulating each difference. There is a possible match if
|
|
%% either the accumulator hits the expected hash or the max hash doesn't match
|
|
%% the final accumulator (to cover if the bloom has been corrupted by a bit
|
|
%% flip somwhere). A miss is more than twice as expensive (on average) than a
|
|
%% potential match - but still only requires around 64 integer additions
|
|
%% and the processing of <100 bytes of data.
|
|
%%
|
|
%% For 2048 keys, this takes up <4KB. The false positive rate is 0.000122
|
|
%% This compares favourably for the equivalent size optimal bloom which
|
|
%% would require 11 hashes and have a false positive rate of 0.000459.
|
|
%% Checking with a positive match should take on average about 6 microseconds,
|
|
%% and a negative match should take around 11 microseconds.
|
|
%%
|
|
%% See ../test/rice_test.erl for proving timings and fpr.
|
|
|
|
|
|
|
|
-module(leveled_rice).
|
|
|
|
-export([create_bloom/1,
|
|
check_key/2,
|
|
check_keys/2]).
|
|
|
|
-include_lib("eunit/include/eunit.hrl").
|
|
|
|
-define(SLOT_COUNT, 64).
|
|
-define(MAX_HASH, 16777216).
|
|
-define(DIVISOR_BITS, 13).
|
|
-define(DIVISOR, 8092).
|
|
|
|
%% Create a bitstring representing the bloom filter from a key list
|
|
|
|
create_bloom(KeyList) ->
|
|
create_bloom(KeyList, ?SLOT_COUNT, ?MAX_HASH).
|
|
|
|
create_bloom(KeyList, SlotCount, MaxHash) ->
|
|
HashLists = array:new(SlotCount, [{default, []}]),
|
|
OrdHashLists = create_hashlist(KeyList, HashLists, SlotCount, MaxHash),
|
|
serialise_bloom(OrdHashLists).
|
|
|
|
|
|
%% Checking for a key
|
|
|
|
check_keys([], _) ->
|
|
true;
|
|
check_keys([Key|Rest], BitStr) ->
|
|
case check_key(Key, BitStr) of
|
|
false ->
|
|
false;
|
|
true ->
|
|
check_keys(Rest, BitStr)
|
|
end.
|
|
|
|
check_key(Key, BitStr) ->
|
|
check_key(Key, BitStr, ?SLOT_COUNT, ?MAX_HASH, ?DIVISOR_BITS, ?DIVISOR).
|
|
|
|
check_key(Key, BitStr, SlotCount, MaxHash, Factor, Divisor) ->
|
|
{Slot, Hash} = get_slothash(Key, MaxHash, SlotCount),
|
|
{StartPos, Length, TopHash} = find_position(Slot, BitStr, 0, 40 * SlotCount),
|
|
case BitStr of
|
|
<<_:StartPos/bitstring, Bloom:Length/bitstring, _/bitstring>> ->
|
|
check_hash(Hash, Bloom, Factor, Divisor, 0, TopHash);
|
|
_ ->
|
|
io:format("Possible corruption of bloom index ~n"),
|
|
true
|
|
end.
|
|
|
|
find_position(Slot, BloomIndex, Counter, StartPosition) ->
|
|
<<TopHash:24/integer, Length:16/integer, Rest/bitstring>> = BloomIndex,
|
|
case Slot of
|
|
Counter ->
|
|
{StartPosition, Length, TopHash};
|
|
_ ->
|
|
find_position(Slot, Rest, Counter + 1, StartPosition + Length)
|
|
end.
|
|
|
|
|
|
% Checking for a hash within a bloom
|
|
|
|
check_hash(_, <<>>, _, _, Acc, MaxHash) ->
|
|
case Acc of
|
|
MaxHash ->
|
|
false;
|
|
_ ->
|
|
io:format("Failure of CRC check on bloom filter~n"),
|
|
true
|
|
end;
|
|
check_hash(HashToCheck, BitStr, Factor, Divisor, Acc, TopHash) ->
|
|
case findexponent(BitStr) of
|
|
{ok, Exponent, BitStrTail} ->
|
|
case findremainder(BitStrTail, Factor) of
|
|
{ok, Remainder, BitStrTail2} ->
|
|
NextHash = Acc + Divisor * Exponent + Remainder,
|
|
case NextHash of
|
|
HashToCheck ->
|
|
true;
|
|
_ ->
|
|
check_hash(HashToCheck, BitStrTail2, Factor,
|
|
Divisor, NextHash, TopHash)
|
|
end;
|
|
error ->
|
|
io:format("Failure of CRC check on bloom filter~n"),
|
|
true
|
|
end;
|
|
error ->
|
|
io:format("Failure of CRC check on bloom filter~n"),
|
|
true
|
|
end.
|
|
|
|
%% Convert the key list into an array of sorted hash lists
|
|
|
|
create_hashlist([], HashLists, _, _) ->
|
|
HashLists;
|
|
create_hashlist([HeadKey|Rest], HashLists, SlotCount, MaxHash) ->
|
|
{Slot, Hash} = get_slothash(HeadKey, MaxHash, SlotCount),
|
|
HashList = array:get(Slot, HashLists),
|
|
create_hashlist(Rest,
|
|
array:set(Slot, lists:usort([Hash|HashList]), HashLists),
|
|
SlotCount, MaxHash).
|
|
|
|
%% Convert an array of hash lists into an serialsed bloom
|
|
|
|
serialise_bloom(HashLists) ->
|
|
SlotCount = array:size(HashLists),
|
|
serialise_bloom(HashLists, SlotCount, 0, []).
|
|
|
|
serialise_bloom(HashLists, SlotCount, Counter, Blooms) ->
|
|
case Counter of
|
|
SlotCount ->
|
|
finalise_bloom(Blooms);
|
|
_ ->
|
|
Bloom = serialise_singlebloom(array:get(Counter, HashLists)),
|
|
serialise_bloom(HashLists, SlotCount, Counter + 1, [Bloom|Blooms])
|
|
end.
|
|
|
|
serialise_singlebloom(HashList) ->
|
|
serialise_singlebloom(HashList, <<>>, 0, ?DIVISOR, ?DIVISOR_BITS).
|
|
|
|
serialise_singlebloom([], BloomStr, TopHash, _, _) ->
|
|
% io:format("Single bloom created with bloom of ~w and top hash of ~w~n", [BloomStr, TopHash]),
|
|
{BloomStr, TopHash};
|
|
serialise_singlebloom([Hash|Rest], BloomStr, TopHash, Divisor, Factor) ->
|
|
HashGap = Hash - TopHash,
|
|
Exponent = buildexponent(HashGap div Divisor),
|
|
Remainder = HashGap rem Divisor,
|
|
NewBloomStr = <<BloomStr/bitstring, Exponent/bitstring, Remainder:Factor/integer>>,
|
|
serialise_singlebloom(Rest, NewBloomStr, Hash, Divisor, Factor).
|
|
|
|
|
|
finalise_bloom(Blooms) ->
|
|
finalise_bloom(Blooms, {<<>>, <<>>}).
|
|
|
|
finalise_bloom([], BloomAcc) ->
|
|
{BloomIndex, BloomStr} = BloomAcc,
|
|
<<BloomIndex/bitstring, BloomStr/bitstring>>;
|
|
finalise_bloom([Bloom|Rest], BloomAcc) ->
|
|
{BloomStr, TopHash} = Bloom,
|
|
{BloomIndexAcc, BloomStrAcc} = BloomAcc,
|
|
Length = bit_size(BloomStr),
|
|
UpdIdx = <<TopHash:24/integer, Length:16/integer, BloomIndexAcc/bitstring>>,
|
|
% io:format("Adding bloom string of ~w to bloom~n", [BloomStr]),
|
|
UpdBloomStr = <<BloomStr/bitstring, BloomStrAcc/bitstring>>,
|
|
finalise_bloom(Rest, {UpdIdx, UpdBloomStr}).
|
|
|
|
|
|
|
|
|
|
buildexponent(Exponent) ->
|
|
buildexponent(Exponent, <<0:1>>).
|
|
|
|
buildexponent(0, OutputBits) ->
|
|
OutputBits;
|
|
buildexponent(Exponent, OutputBits) ->
|
|
buildexponent(Exponent - 1, <<1:1, OutputBits/bitstring>>).
|
|
|
|
|
|
findexponent(BitStr) ->
|
|
findexponent(BitStr, 0).
|
|
|
|
findexponent(<<>>, _) ->
|
|
error;
|
|
findexponent(<<H:1/integer, T/bitstring>>, Acc) ->
|
|
case H of
|
|
1 -> findexponent(T, Acc + 1);
|
|
0 -> {ok, Acc, T}
|
|
end.
|
|
|
|
|
|
findremainder(BitStr, Factor) ->
|
|
case BitStr of
|
|
<<Remainder:Factor/integer, BitStrTail/bitstring>> ->
|
|
{ok, Remainder, BitStrTail};
|
|
_ ->
|
|
error
|
|
end.
|
|
|
|
|
|
get_slothash(Key, MaxHash, SlotCount) ->
|
|
Hash = erlang:phash2(Key, MaxHash),
|
|
{Hash rem SlotCount, Hash div SlotCount}.
|
|
|
|
|
|
%%%%%%%%%%%%%%%%
|
|
% T E S T
|
|
%%%%%%%%%%%%%%%
|
|
|
|
corrupt_bloom(Bloom) ->
|
|
Length = bit_size(Bloom),
|
|
Random = random:uniform(Length),
|
|
<<Part1:Random/bitstring, Bit:1/integer, Rest1/bitstring>> = Bloom,
|
|
case Bit of
|
|
1 ->
|
|
<<Part1/bitstring, 0:1/integer, Rest1/bitstring>>;
|
|
0 ->
|
|
<<Part1/bitstring, 1:1/integer, Rest1/bitstring>>
|
|
end.
|
|
|
|
bloom_test() ->
|
|
KeyList = ["key1", "key2", "key3", "key4"],
|
|
Bloom = create_bloom(KeyList),
|
|
io:format("Bloom of ~w of length ~w ~n", [Bloom, bit_size(Bloom)]),
|
|
?assertMatch(true, check_key("key1", Bloom)),
|
|
?assertMatch(true, check_key("key2", Bloom)),
|
|
?assertMatch(true, check_key("key3", Bloom)),
|
|
?assertMatch(true, check_key("key4", Bloom)),
|
|
?assertMatch(false, check_key("key5", Bloom)).
|
|
|
|
bloom_corruption_test() ->
|
|
KeyList = ["key1", "key2", "key3", "key4"],
|
|
Bloom = create_bloom(KeyList),
|
|
Bloom1 = corrupt_bloom(Bloom),
|
|
?assertMatch(true, check_keys(KeyList, Bloom1)),
|
|
Bloom2 = corrupt_bloom(Bloom),
|
|
?assertMatch(true, check_keys(KeyList, Bloom2)),
|
|
Bloom3 = corrupt_bloom(Bloom),
|
|
?assertMatch(true, check_keys(KeyList, Bloom3)),
|
|
Bloom4 = corrupt_bloom(Bloom),
|
|
?assertMatch(true, check_keys(KeyList, Bloom4)),
|
|
Bloom5 = corrupt_bloom(Bloom),
|
|
?assertMatch(true, check_keys(KeyList, Bloom5)),
|
|
Bloom6 = corrupt_bloom(Bloom),
|
|
?assertMatch(true, check_keys(KeyList, Bloom6)).
|
|
|
|
|