FAdvise
Add fadvise magic to SFT files. Also delete unnecessary rice modeule
This commit is contained in:
parent
938cc0fc16
commit
2d981cb2e7
2 changed files with 7 additions and 304 deletions
|
@ -1,283 +0,0 @@
|
|||
%% Used for creating fixed-size self-regulating encoded bloom filters
|
||||
%%
|
||||
%% Normally a bloom filter in order to achieve optimium size increases the
|
||||
%% number of hashes as the desired false positive rate increases. There is
|
||||
%% a processing overhead for checking this bloom, both because of the number
|
||||
%% of hash calculations required, and also because of the need to CRC check
|
||||
%% the bloom to ensure a false negative result is not returned due to
|
||||
%% corruption.
|
||||
%%
|
||||
%% A more space efficient bloom can be achieved through the compression of
|
||||
%% bloom filters with less hashes (and in an optimal case a single hash).
|
||||
%% This can be achieved using rice encoding.
|
||||
%%
|
||||
%% Rice-encoding and single hash blooms are used here in order to provide an
|
||||
%% optimally space efficient solution, but also as the processing required to
|
||||
%% support uncompression can be concurrently performing a checksum role.
|
||||
%%
|
||||
%% For this to work, the bloom is divided into 64 parts and a 32-bit hash is
|
||||
%% required. Each hash is placed into one of 64 blooms based on the six least
|
||||
%% significant bits of the hash, and the fmost significant 26-bits are used
|
||||
%% to indicate the bit to be added to the bloom.
|
||||
%%
|
||||
%% The bloom is then created by calculating the differences between the ordered
|
||||
%% elements of the hash list and representing the difference using an exponent
|
||||
%% and a 13-bit remainder i.e.
|
||||
%% 8000 -> 0 11111 01000000
|
||||
%% 10000 -> 10 00000 00010000
|
||||
%% 20000 -> 110 01110 00100000
|
||||
%%
|
||||
%% Each bloom should have approximately 64 differences.
|
||||
%%
|
||||
%% Fronting the bloom is a bloom index, formed first by 16 pairs of 3-byte
|
||||
%% max hash, 2-byte length (bits) - with then each of the encoded bitstrings
|
||||
%% appended. The max hash is the total of all the differences (which should
|
||||
%% be the highest hash in the bloom).
|
||||
%%
|
||||
%% To check a key against the bloom, hash it, take the four least signifcant
|
||||
%% bits and read the start pointer, max hash end pointer from the expected
|
||||
%% positions in the bloom index. Then roll through from the start pointer to
|
||||
%% the end pointer, accumulating each difference. There is a possible match if
|
||||
%% either the accumulator hits the expected hash or the max hash doesn't match
|
||||
%% the final accumulator (to cover if the bloom has been corrupted by a bit
|
||||
%% flip somwhere). A miss is more than twice as expensive (on average) than a
|
||||
%% potential match - but still only requires around 64 integer additions
|
||||
%% and the processing of <100 bytes of data.
|
||||
%%
|
||||
%% For 2048 keys, this takes up <4KB. The false positive rate is 0.000122
|
||||
%% This compares favourably for the equivalent size optimal bloom which
|
||||
%% would require 11 hashes and have a false positive rate of 0.000459.
|
||||
%% Checking with a positive match should take on average about 6 microseconds,
|
||||
%% and a negative match should take around 11 microseconds.
|
||||
%%
|
||||
%% See ../test/rice_test.erl for proving timings and fpr.
|
||||
|
||||
|
||||
|
||||
-module(leveled_rice).
|
||||
|
||||
-export([create_bloom/1,
|
||||
check_key/2,
|
||||
check_keys/2]).
|
||||
|
||||
-include_lib("eunit/include/eunit.hrl").
|
||||
|
||||
-define(SLOT_COUNT, 64).
|
||||
-define(MAX_HASH, 16777216).
|
||||
-define(DIVISOR_BITS, 13).
|
||||
-define(DIVISOR, 8092).
|
||||
|
||||
%% Create a bitstring representing the bloom filter from a key list
|
||||
|
||||
create_bloom(KeyList) ->
|
||||
create_bloom(KeyList, ?SLOT_COUNT, ?MAX_HASH).
|
||||
|
||||
create_bloom(KeyList, SlotCount, MaxHash) ->
|
||||
HashLists = array:new(SlotCount, [{default, []}]),
|
||||
OrdHashLists = create_hashlist(KeyList, HashLists, SlotCount, MaxHash),
|
||||
serialise_bloom(OrdHashLists).
|
||||
|
||||
|
||||
%% Checking for a key
|
||||
|
||||
check_keys([], _) ->
|
||||
true;
|
||||
check_keys([Key|Rest], BitStr) ->
|
||||
case check_key(Key, BitStr) of
|
||||
false ->
|
||||
false;
|
||||
true ->
|
||||
check_keys(Rest, BitStr)
|
||||
end.
|
||||
|
||||
check_key(Key, BitStr) ->
|
||||
check_key(Key, BitStr, ?SLOT_COUNT, ?MAX_HASH, ?DIVISOR_BITS, ?DIVISOR).
|
||||
|
||||
check_key(Key, BitStr, SlotCount, MaxHash, Factor, Divisor) ->
|
||||
{Slot, Hash} = get_slothash(Key, MaxHash, SlotCount),
|
||||
{StartPos, Length, TopHash} = find_position(Slot, BitStr, 0, 40 * SlotCount),
|
||||
case BitStr of
|
||||
<<_:StartPos/bitstring, Bloom:Length/bitstring, _/bitstring>> ->
|
||||
check_hash(Hash, Bloom, Factor, Divisor, 0, TopHash);
|
||||
_ ->
|
||||
io:format("Possible corruption of bloom index ~n"),
|
||||
true
|
||||
end.
|
||||
|
||||
find_position(Slot, BloomIndex, Counter, StartPosition) ->
|
||||
<<TopHash:24/integer, Length:16/integer, Rest/bitstring>> = BloomIndex,
|
||||
case Slot of
|
||||
Counter ->
|
||||
{StartPosition, Length, TopHash};
|
||||
_ ->
|
||||
find_position(Slot, Rest, Counter + 1, StartPosition + Length)
|
||||
end.
|
||||
|
||||
|
||||
% Checking for a hash within a bloom
|
||||
|
||||
check_hash(_, <<>>, _, _, Acc, MaxHash) ->
|
||||
case Acc of
|
||||
MaxHash ->
|
||||
false;
|
||||
_ ->
|
||||
io:format("Failure of CRC check on bloom filter~n"),
|
||||
true
|
||||
end;
|
||||
check_hash(HashToCheck, BitStr, Factor, Divisor, Acc, TopHash) ->
|
||||
case findexponent(BitStr) of
|
||||
{ok, Exponent, BitStrTail} ->
|
||||
case findremainder(BitStrTail, Factor) of
|
||||
{ok, Remainder, BitStrTail2} ->
|
||||
NextHash = Acc + Divisor * Exponent + Remainder,
|
||||
case NextHash of
|
||||
HashToCheck ->
|
||||
true;
|
||||
_ ->
|
||||
check_hash(HashToCheck, BitStrTail2, Factor,
|
||||
Divisor, NextHash, TopHash)
|
||||
end;
|
||||
error ->
|
||||
io:format("Failure of CRC check on bloom filter~n"),
|
||||
true
|
||||
end;
|
||||
error ->
|
||||
io:format("Failure of CRC check on bloom filter~n"),
|
||||
true
|
||||
end.
|
||||
|
||||
%% Convert the key list into an array of sorted hash lists
|
||||
|
||||
create_hashlist([], HashLists, _, _) ->
|
||||
HashLists;
|
||||
create_hashlist([HeadKey|Rest], HashLists, SlotCount, MaxHash) ->
|
||||
{Slot, Hash} = get_slothash(HeadKey, MaxHash, SlotCount),
|
||||
HashList = array:get(Slot, HashLists),
|
||||
create_hashlist(Rest,
|
||||
array:set(Slot, lists:usort([Hash|HashList]), HashLists),
|
||||
SlotCount, MaxHash).
|
||||
|
||||
%% Convert an array of hash lists into an serialsed bloom
|
||||
|
||||
serialise_bloom(HashLists) ->
|
||||
SlotCount = array:size(HashLists),
|
||||
serialise_bloom(HashLists, SlotCount, 0, []).
|
||||
|
||||
serialise_bloom(HashLists, SlotCount, Counter, Blooms) ->
|
||||
case Counter of
|
||||
SlotCount ->
|
||||
finalise_bloom(Blooms);
|
||||
_ ->
|
||||
Bloom = serialise_singlebloom(array:get(Counter, HashLists)),
|
||||
serialise_bloom(HashLists, SlotCount, Counter + 1, [Bloom|Blooms])
|
||||
end.
|
||||
|
||||
serialise_singlebloom(HashList) ->
|
||||
serialise_singlebloom(HashList, <<>>, 0, ?DIVISOR, ?DIVISOR_BITS).
|
||||
|
||||
serialise_singlebloom([], BloomStr, TopHash, _, _) ->
|
||||
% io:format("Single bloom created with bloom of ~w and top hash of ~w~n", [BloomStr, TopHash]),
|
||||
{BloomStr, TopHash};
|
||||
serialise_singlebloom([Hash|Rest], BloomStr, TopHash, Divisor, Factor) ->
|
||||
HashGap = Hash - TopHash,
|
||||
Exponent = buildexponent(HashGap div Divisor),
|
||||
Remainder = HashGap rem Divisor,
|
||||
NewBloomStr = <<BloomStr/bitstring, Exponent/bitstring, Remainder:Factor/integer>>,
|
||||
serialise_singlebloom(Rest, NewBloomStr, Hash, Divisor, Factor).
|
||||
|
||||
|
||||
finalise_bloom(Blooms) ->
|
||||
finalise_bloom(Blooms, {<<>>, <<>>}).
|
||||
|
||||
finalise_bloom([], BloomAcc) ->
|
||||
{BloomIndex, BloomStr} = BloomAcc,
|
||||
<<BloomIndex/bitstring, BloomStr/bitstring>>;
|
||||
finalise_bloom([Bloom|Rest], BloomAcc) ->
|
||||
{BloomStr, TopHash} = Bloom,
|
||||
{BloomIndexAcc, BloomStrAcc} = BloomAcc,
|
||||
Length = bit_size(BloomStr),
|
||||
UpdIdx = <<TopHash:24/integer, Length:16/integer, BloomIndexAcc/bitstring>>,
|
||||
% io:format("Adding bloom string of ~w to bloom~n", [BloomStr]),
|
||||
UpdBloomStr = <<BloomStr/bitstring, BloomStrAcc/bitstring>>,
|
||||
finalise_bloom(Rest, {UpdIdx, UpdBloomStr}).
|
||||
|
||||
|
||||
|
||||
|
||||
buildexponent(Exponent) ->
|
||||
buildexponent(Exponent, <<0:1>>).
|
||||
|
||||
buildexponent(0, OutputBits) ->
|
||||
OutputBits;
|
||||
buildexponent(Exponent, OutputBits) ->
|
||||
buildexponent(Exponent - 1, <<1:1, OutputBits/bitstring>>).
|
||||
|
||||
|
||||
findexponent(BitStr) ->
|
||||
findexponent(BitStr, 0).
|
||||
|
||||
findexponent(<<>>, _) ->
|
||||
error;
|
||||
findexponent(<<H:1/integer, T/bitstring>>, Acc) ->
|
||||
case H of
|
||||
1 -> findexponent(T, Acc + 1);
|
||||
0 -> {ok, Acc, T}
|
||||
end.
|
||||
|
||||
|
||||
findremainder(BitStr, Factor) ->
|
||||
case BitStr of
|
||||
<<Remainder:Factor/integer, BitStrTail/bitstring>> ->
|
||||
{ok, Remainder, BitStrTail};
|
||||
_ ->
|
||||
error
|
||||
end.
|
||||
|
||||
|
||||
get_slothash(Key, MaxHash, SlotCount) ->
|
||||
Hash = erlang:phash2(Key, MaxHash),
|
||||
{Hash rem SlotCount, Hash div SlotCount}.
|
||||
|
||||
|
||||
%%%%%%%%%%%%%%%%
|
||||
% T E S T
|
||||
%%%%%%%%%%%%%%%
|
||||
|
||||
corrupt_bloom(Bloom) ->
|
||||
Length = bit_size(Bloom),
|
||||
Random = random:uniform(Length),
|
||||
<<Part1:Random/bitstring, Bit:1/integer, Rest1/bitstring>> = Bloom,
|
||||
case Bit of
|
||||
1 ->
|
||||
<<Part1/bitstring, 0:1/integer, Rest1/bitstring>>;
|
||||
0 ->
|
||||
<<Part1/bitstring, 1:1/integer, Rest1/bitstring>>
|
||||
end.
|
||||
|
||||
bloom_test() ->
|
||||
KeyList = ["key1", "key2", "key3", "key4"],
|
||||
Bloom = create_bloom(KeyList),
|
||||
io:format("Bloom of ~w of length ~w ~n", [Bloom, bit_size(Bloom)]),
|
||||
?assertMatch(true, check_key("key1", Bloom)),
|
||||
?assertMatch(true, check_key("key2", Bloom)),
|
||||
?assertMatch(true, check_key("key3", Bloom)),
|
||||
?assertMatch(true, check_key("key4", Bloom)),
|
||||
?assertMatch(false, check_key("key5", Bloom)).
|
||||
|
||||
bloom_corruption_test() ->
|
||||
KeyList = ["key1", "key2", "key3", "key4"],
|
||||
Bloom = create_bloom(KeyList),
|
||||
Bloom1 = corrupt_bloom(Bloom),
|
||||
?assertMatch(true, check_keys(KeyList, Bloom1)),
|
||||
Bloom2 = corrupt_bloom(Bloom),
|
||||
?assertMatch(true, check_keys(KeyList, Bloom2)),
|
||||
Bloom3 = corrupt_bloom(Bloom),
|
||||
?assertMatch(true, check_keys(KeyList, Bloom3)),
|
||||
Bloom4 = corrupt_bloom(Bloom),
|
||||
?assertMatch(true, check_keys(KeyList, Bloom4)),
|
||||
Bloom5 = corrupt_bloom(Bloom),
|
||||
?assertMatch(true, check_keys(KeyList, Bloom5)),
|
||||
Bloom6 = corrupt_bloom(Bloom),
|
||||
?assertMatch(true, check_keys(KeyList, Bloom6)).
|
||||
|
||||
|
|
@ -159,7 +159,6 @@
|
|||
sft_close/1,
|
||||
sft_clear/1,
|
||||
sft_checkready/1,
|
||||
sft_getfilename/1,
|
||||
sft_setfordelete/2,
|
||||
sft_getmaxsequencenumber/1,
|
||||
generate_randomkeys/1]).
|
||||
|
@ -255,9 +254,6 @@ sft_close(Pid) ->
|
|||
sft_checkready(Pid) ->
|
||||
gen_server:call(Pid, background_complete, infinity).
|
||||
|
||||
sft_getfilename(Pid) ->
|
||||
gen_server:call(Pid, get_filename, infinty).
|
||||
|
||||
sft_getmaxsequencenumber(Pid) ->
|
||||
gen_server:call(Pid, get_maxsqn, infinity).
|
||||
|
||||
|
@ -330,8 +326,6 @@ handle_call(background_complete, _From, State) ->
|
|||
false ->
|
||||
{reply, {error, State#state.background_failure}, State}
|
||||
end;
|
||||
handle_call(get_filename, _From, State) ->
|
||||
{reply, State#state.filename, State};
|
||||
handle_call({set_for_delete, Penciller}, _From, State) ->
|
||||
{reply,
|
||||
ok,
|
||||
|
@ -362,9 +356,7 @@ handle_info(timeout, State) ->
|
|||
end;
|
||||
false ->
|
||||
{noreply, State}
|
||||
end;
|
||||
handle_info(_Info, State) ->
|
||||
{noreply, State}.
|
||||
end.
|
||||
|
||||
terminate(Reason, State) ->
|
||||
io:format("Exit called for reason ~w on filename ~s~n",
|
||||
|
@ -878,18 +870,12 @@ sftwrite_function(finalise,
|
|||
IndexLength:32/integer,
|
||||
FilterLength:32/integer,
|
||||
SummaryLength:32/integer>>),
|
||||
file:close(Handle);
|
||||
sftwrite_function(finalise,
|
||||
{Handle,
|
||||
SlotIndex,
|
||||
SNExtremes,
|
||||
KeyExtremes}) ->
|
||||
{SlotFilters, PointerIndex} = convert_slotindex(SlotIndex),
|
||||
sftwrite_function(finalise,
|
||||
{Handle,
|
||||
{SlotFilters, PointerIndex},
|
||||
SNExtremes,
|
||||
KeyExtremes}).
|
||||
{ok, _Position} = file:position(Handle, bof),
|
||||
ok = file:advise(Handle,
|
||||
BlocksLength + IndexLength,
|
||||
FilterLength,
|
||||
will_need),
|
||||
file:close(Handle).
|
||||
|
||||
%% Level 0 files are of variable (infinite) size to avoid issues with having
|
||||
%% any remainders when flushing from memory
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue