FAdvise
Add fadvise magic to SFT files. Also delete unnecessary rice module
This commit is contained in:
parent
938cc0fc16
commit
2d981cb2e7
2 changed files with 7 additions and 304 deletions
|
@ -1,283 +0,0 @@
|
||||||
%% Used for creating fixed-size self-regulating encoded bloom filters
|
|
||||||
%%
|
|
||||||
%% Normally a bloom filter in order to achieve optimum size increases the
|
|
||||||
%% number of hashes as the desired false positive rate increases. There is
|
|
||||||
%% a processing overhead for checking this bloom, both because of the number
|
|
||||||
%% of hash calculations required, and also because of the need to CRC check
|
|
||||||
%% the bloom to ensure a false negative result is not returned due to
|
|
||||||
%% corruption.
|
|
||||||
%%
|
|
||||||
%% A more space efficient bloom can be achieved through the compression of
|
|
||||||
%% bloom filters with fewer hashes (and in an optimal case a single hash).
|
|
||||||
%% This can be achieved using rice encoding.
|
|
||||||
%%
|
|
||||||
%% Rice-encoding and single hash blooms are used here in order to provide an
|
|
||||||
%% optimally space efficient solution, but also as the processing required to
|
|
||||||
%% support uncompression can be concurrently performing a checksum role.
|
|
||||||
%%
|
|
||||||
%% For this to work, the bloom is divided into 64 parts and a 32-bit hash is
|
|
||||||
%% required. Each hash is placed into one of 64 blooms based on the six least
|
|
||||||
%% significant bits of the hash, and the most significant 26 bits are used
|
|
||||||
%% to indicate the bit to be added to the bloom.
|
|
||||||
%%
|
|
||||||
%% The bloom is then created by calculating the differences between the ordered
|
|
||||||
%% elements of the hash list and representing the difference using an exponent
|
|
||||||
%% and a 13-bit remainder i.e.
|
|
||||||
%% 8000 -> 0 11111 01000000
|
|
||||||
%% 10000 -> 10 00000 00010000
|
|
||||||
%% 20000 -> 110 01110 00100000
|
|
||||||
%%
|
|
||||||
%% Each bloom should have approximately 64 differences.
|
|
||||||
%%
|
|
||||||
%% Fronting the bloom is a bloom index, formed first by 64 pairs of 3-byte
|
|
||||||
%% max hash, 2-byte length (bits) - with then each of the encoded bitstrings
|
|
||||||
%% appended. The max hash is the total of all the differences (which should
|
|
||||||
%% be the highest hash in the bloom).
|
|
||||||
%%
|
|
||||||
%% To check a key against the bloom, hash it, take the six least significant
|
|
||||||
%% bits and read the start pointer, max hash end pointer from the expected
|
|
||||||
%% positions in the bloom index. Then roll through from the start pointer to
|
|
||||||
%% the end pointer, accumulating each difference. There is a possible match if
|
|
||||||
%% either the accumulator hits the expected hash or the max hash doesn't match
|
|
||||||
%% the final accumulator (to cover if the bloom has been corrupted by a bit
|
|
||||||
%% flip somewhere). A miss is more than twice as expensive (on average) than a
|
|
||||||
%% potential match - but still only requires around 64 integer additions
|
|
||||||
%% and the processing of <100 bytes of data.
|
|
||||||
%%
|
|
||||||
%% For 2048 keys, this takes up <4KB. The false positive rate is 0.000122
|
|
||||||
%% This compares favourably for the equivalent size optimal bloom which
|
|
||||||
%% would require 11 hashes and have a false positive rate of 0.000459.
|
|
||||||
%% Checking with a positive match should take on average about 6 microseconds,
|
|
||||||
%% and a negative match should take around 11 microseconds.
|
|
||||||
%%
|
|
||||||
%% See ../test/rice_test.erl for proving timings and fpr.
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
-module(leveled_rice).
|
|
||||||
|
|
||||||
-export([create_bloom/1,
|
|
||||||
check_key/2,
|
|
||||||
check_keys/2]).
|
|
||||||
|
|
||||||
-include_lib("eunit/include/eunit.hrl").
|
|
||||||
|
|
||||||
%% Number of slots (sub-blooms) the keys are distributed across; also the
%% number of 40-bit entries in the bloom index.
-define(SLOT_COUNT, 64).
|
|
||||||
%% Range passed to erlang:phash2/2 when hashing a key (16777216 = 2^24).
-define(MAX_HASH, 16777216).
|
|
||||||
%% Width in bits of the remainder field written for each encoded difference.
-define(DIVISOR_BITS, 13).
|
|
||||||
%% Divisor used to split each difference into a unary quotient and a
%% fixed-width remainder.
%% NOTE(review): 2^13 is 8192, not 8092. The encoding still round-trips
%% because every remainder is < 8092 and so fits in 13 bits, but this looks
%% like a typo for 8192 - confirm before changing, as it alters the format.
-define(DIVISOR, 8092).
|
|
||||||
|
|
||||||
%% Create a bitstring representing the bloom filter from a key list
|
|
||||||
|
|
||||||
%% Build the serialised bloom for KeyList using the module defaults for the
%% slot count and hash range.
create_bloom(KeyList) ->
    create_bloom(KeyList, ?SLOT_COUNT, ?MAX_HASH).

%% Build the serialised bloom for KeyList.
%% An array of SlotCount empty hash lists is filled from the keys and the
%% resulting per-slot sorted lists are then serialised into one bitstring.
create_bloom(KeyList, SlotCount, MaxHash) ->
    EmptySlots = array:new(SlotCount, [{default, []}]),
    serialise_bloom(create_hashlist(KeyList, EmptySlots, SlotCount, MaxHash)).
|
|
||||||
|
|
||||||
|
|
||||||
%% Checking for a key
|
|
||||||
|
|
||||||
%% Return true only when every key in the list is a possible member of the
%% bloom; stops at the first key reported as absent. An empty list gives true.
check_keys(Keys, BitStr) ->
    lists:all(fun(Key) -> check_key(Key, BitStr) end, Keys).
|
|
||||||
|
|
||||||
%% Check a single key against a bloom using the module default parameters.
check_key(Key, BitStr) ->
    check_key(Key, BitStr, ?SLOT_COUNT, ?MAX_HASH, ?DIVISOR_BITS, ?DIVISOR).

%% Check a single key against a bloom.
%% The key is hashed to a {Slot, Hash} pair, the slot's bit offset, bit length
%% and top hash are read from the index at the front of BitStr, and the slot's
%% encoded differences are then scanned for Hash.
%% Returns true for a possible match and false for a definite miss. If the
%% slot cannot be cut out of BitStr at the indexed position the bloom is
%% treated as corrupt and true is returned.
check_key(Key, BitStr, SlotCount, MaxHash, Factor, Divisor) ->
    {Slot, Hash} = get_slothash(Key, MaxHash, SlotCount),
    %% Each index entry is 40 bits, so the encoded slots begin after
    %% 40 * SlotCount bits.
    IndexBits = 40 * SlotCount,
    {Offset, Size, TopHash} = find_position(Slot, BitStr, 0, IndexBits),
    case BitStr of
        <<_:Offset/bitstring, SlotBloom:Size/bitstring, _/bitstring>> ->
            check_hash(Hash, SlotBloom, Factor, Divisor, 0, TopHash);
        _ ->
            io:format("Possible corruption of bloom index ~n"),
            true
    end.
|
|
||||||
|
|
||||||
%% Walk the bloom index one 40-bit entry (24-bit top hash, 16-bit length) at a
%% time until entry number Slot is reached.
%% Counter is the number of the entry at the head of BloomIndex and
%% StartPosition the bit offset accumulated so far; the lengths of all
%% skipped entries are added to that offset.
%% Returns {StartPosition, Length, TopHash} for the requested slot.
find_position(Slot, BloomIndex, Counter, StartPosition) ->
    <<TopHash:24/integer, Length:16/integer, Rest/bitstring>> = BloomIndex,
    if
        Slot =:= Counter ->
            {StartPosition, Length, TopHash};
        true ->
            find_position(Slot, Rest, Counter + 1, StartPosition + Length)
    end.
|
|
||||||
|
|
||||||
|
|
||||||
% Checking for a hash within a bloom
|
|
||||||
|
|
||||||
%% Scan one slot's encoded differences looking for HashToCheck.
%% Acc is the running sum of the differences decoded so far and TopHash the
%% total recorded for the slot in the index.
%% Returns true as soon as the running sum equals HashToCheck. When the
%% bitstring is exhausted, returns false if the final sum equals TopHash;
%% otherwise the slot is treated as corrupt and true is returned. A difference
%% that cannot be decoded is likewise treated as corruption.
check_hash(_, <<>>, _, _, TopHash, TopHash) ->
    false;
check_hash(_, <<>>, _, _, _, _) ->
    io:format("Failure of CRC check on bloom filter~n"),
    true;
check_hash(HashToCheck, BitStr, Factor, Divisor, Acc, TopHash) ->
    case read_difference(BitStr, Factor) of
        {ok, Quotient, Remainder, Tail} ->
            case Acc + Divisor * Quotient + Remainder of
                HashToCheck ->
                    true;
                RunningSum ->
                    check_hash(HashToCheck, Tail, Factor,
                               Divisor, RunningSum, TopHash)
            end;
        error ->
            io:format("Failure of CRC check on bloom filter~n"),
            true
    end.

%% Decode one difference: a unary quotient followed by a Factor-bit remainder.
%% Returns {ok, Quotient, Remainder, Tail}, or error if either part is missing.
read_difference(BitStr, Factor) ->
    case findexponent(BitStr) of
        {ok, Quotient, AfterQuotient} ->
            case findremainder(AfterQuotient, Factor) of
                {ok, Remainder, Tail} ->
                    {ok, Quotient, Remainder, Tail};
                error ->
                    error
            end;
        error ->
            error
    end.
|
|
||||||
|
|
||||||
%% Convert the key list into an array of sorted hash lists
|
|
||||||
|
|
||||||
%% Fold the keys into the array of per-slot hash lists.
%% Each key is hashed to a {Slot, Hash} pair and Hash is merged into that
%% slot's list, which is kept sorted and free of duplicates.
create_hashlist(KeyList, HashLists, SlotCount, MaxHash) ->
    AddKey =
        fun(Key, Slots) ->
            {Slot, Hash} = get_slothash(Key, MaxHash, SlotCount),
            Existing = array:get(Slot, Slots),
            array:set(Slot, lists:usort([Hash | Existing]), Slots)
        end,
    lists:foldl(AddKey, HashLists, KeyList).
|
|
||||||
|
|
||||||
%% Convert an array of hash lists into a serialised bloom
|
|
||||||
|
|
||||||
%% Serialise every slot of the hash-list array into the final bloom bitstring.
serialise_bloom(HashLists) ->
    serialise_bloom(HashLists, array:size(HashLists), 0, []).

%% Encode slots Counter..SlotCount-1 in turn, pushing each {Bits, TopHash}
%% result onto Blooms (so the list ends up in reverse slot order), then hand
%% the collected list to finalise_bloom/1.
serialise_bloom(_HashLists, SlotCount, SlotCount, Blooms) ->
    finalise_bloom(Blooms);
serialise_bloom(HashLists, SlotCount, Counter, Blooms) ->
    Encoded = serialise_singlebloom(array:get(Counter, HashLists)),
    serialise_bloom(HashLists, SlotCount, Counter + 1, [Encoded | Blooms]).
|
|
||||||
|
|
||||||
%% Encode one slot's sorted hash list using the module default divisor and
%% remainder width.
serialise_singlebloom(HashList) ->
    serialise_singlebloom(HashList, <<>>, 0, ?DIVISOR, ?DIVISOR_BITS).

%% Encode a sorted hash list as a sequence of differences.
%% For each hash the gap from the previous hash (initially 0) is written as a
%% unary-coded quotient (gap div Divisor) followed by a Factor-bit remainder
%% (gap rem Divisor).
%% Returns {EncodedBits, LastHash}, where LastHash is the final hash seen
%% (0 for an empty list).
serialise_singlebloom([], Encoded, LastHash, _, _) ->
    {Encoded, LastHash};
serialise_singlebloom([Hash | Rest], Encoded, LastHash, Divisor, Factor) ->
    Gap = Hash - LastHash,
    Quotient = buildexponent(Gap div Divisor),
    Extended = <<Encoded/bitstring,
                 Quotient/bitstring,
                 (Gap rem Divisor):Factor/integer>>,
    serialise_singlebloom(Rest, Extended, Hash, Divisor, Factor).
|
|
||||||
|
|
||||||
|
|
||||||
%% Assemble the complete bloom from a list of {Bits, TopHash} slot encodings.
finalise_bloom(Blooms) ->
    finalise_bloom(Blooms, {<<>>, <<>>}).

%% Consume the slot encodings one at a time, prepending each slot's index
%% entry (24-bit top hash, 16-bit length in bits) and its encoded bits to the
%% accumulators. Because entries are prepended, the output is in the reverse
%% of the input list order. When the list is exhausted the index is placed in
%% front of the concatenated slot bits.
finalise_bloom([], {Index, Body}) ->
    <<Index/bitstring, Body/bitstring>>;
finalise_bloom([{SlotBits, SlotTop} | Remaining], {Index, Body}) ->
    SlotLength = bit_size(SlotBits),
    finalise_bloom(Remaining,
                   {<<SlotTop:24/integer, SlotLength:16/integer, Index/bitstring>>,
                    <<SlotBits/bitstring, Body/bitstring>>}).
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
%% Unary-encode a non-negative integer: Exponent one-bits followed by a
%% single terminating zero-bit (so 0 encodes as a lone zero-bit).
buildexponent(Exponent) ->
    %% (1 bsl N) - 1 is an integer whose N low bits are all set.
    Ones = (1 bsl Exponent) - 1,
    <<Ones:Exponent/integer, 0:1>>.
|
|
||||||
|
|
||||||
|
|
||||||
%% Decode a unary-coded integer from the front of BitStr.
%% Returns {ok, Count, Tail} where Count is the number of leading one-bits and
%% Tail is everything after the terminating zero-bit, or error if the
%% bitstring runs out before a zero-bit is found.
findexponent(BitStr) ->
    findexponent(BitStr, 0).

findexponent(<<>>, _) ->
    error;
findexponent(<<1:1, Tail/bitstring>>, Count) ->
    findexponent(Tail, Count + 1);
findexponent(<<0:1, Tail/bitstring>>, Count) ->
    {ok, Count, Tail}.
|
|
||||||
|
|
||||||
|
|
||||||
%% Read a Factor-bit unsigned integer from the front of BitStr.
%% Returns {ok, Remainder, Tail}, or error when fewer than Factor bits are
%% available (or the arguments cannot form such a match at all).
findremainder(BitStr, Factor)
        when is_integer(Factor), Factor >= 0, bit_size(BitStr) >= Factor ->
    <<Remainder:Factor/integer, Tail/bitstring>> = BitStr,
    {ok, Remainder, Tail};
findremainder(_, _) ->
    error.
|
|
||||||
|
|
||||||
|
|
||||||
%% Hash Key into the range 0..MaxHash-1 with erlang:phash2/2 and split the
%% result into {Slot, Hash}: Slot is the hash modulo SlotCount and Hash is the
%% hash integer-divided by SlotCount.
get_slothash(Key, MaxHash, SlotCount) ->
    FullHash = erlang:phash2(Key, MaxHash),
    Slot = FullHash rem SlotCount,
    SlotHash = FullHash div SlotCount,
    {Slot, SlotHash}.
|
|
||||||
|
|
||||||
|
|
||||||
%%%%%%%%%%%%%%%%
|
|
||||||
% T E S T
|
|
||||||
%%%%%%%%%%%%%%%
|
|
||||||
|
|
||||||
%% Test helper: return a copy of Bloom with exactly one randomly chosen bit
%% inverted. The result has the same bit size as the input.
%% Bloom must be a non-empty bitstring (random:uniform/1 rejects 0).
corrupt_bloom(Bloom) ->
    Length = bit_size(Bloom),
    %% random:uniform(Length) yields 1..Length. The value is used as the
    %% number of bits to skip before the flipped bit, so it must lie in
    %% 0..Length-1; without the adjustment a draw of Length would leave no
    %% bit to flip (badmatch) and the first bit could never be chosen.
    Skip = random:uniform(Length) - 1,
    <<Before:Skip/bitstring, Bit:1/integer, After/bitstring>> = Bloom,
    Flipped = 1 - Bit,
    <<Before/bitstring, Flipped:1/integer, After/bitstring>>.
|
|
||||||
|
|
||||||
%% A bloom built from four keys must report each of them as a possible match
%% and must report an unrelated key ("key5") as a miss.
bloom_test() ->
    Present = ["key1", "key2", "key3", "key4"],
    Bloom = create_bloom(Present),
    io:format("Bloom of ~w of length ~w ~n", [Bloom, bit_size(Bloom)]),
    lists:foreach(
        fun(Key) -> ?assertMatch(true, check_key(Key, Bloom)) end,
        Present),
    ?assertMatch(false, check_key("key5", Bloom)).
|
|
||||||
|
|
||||||
%% A single flipped bit must never produce a false negative: for six
%% independently corrupted copies of the same bloom, every original key must
%% still be reported as a possible match.
bloom_corruption_test() ->
    KeyList = ["key1", "key2", "key3", "key4"],
    Bloom = create_bloom(KeyList),
    lists:foreach(
        fun(_Round) ->
            Corrupted = corrupt_bloom(Bloom),
            ?assertMatch(true, check_keys(KeyList, Corrupted))
        end,
        lists:seq(1, 6)).
|
|
||||||
|
|
||||||
|
|
|
@ -159,7 +159,6 @@
|
||||||
sft_close/1,
|
sft_close/1,
|
||||||
sft_clear/1,
|
sft_clear/1,
|
||||||
sft_checkready/1,
|
sft_checkready/1,
|
||||||
sft_getfilename/1,
|
|
||||||
sft_setfordelete/2,
|
sft_setfordelete/2,
|
||||||
sft_getmaxsequencenumber/1,
|
sft_getmaxsequencenumber/1,
|
||||||
generate_randomkeys/1]).
|
generate_randomkeys/1]).
|
||||||
|
@ -255,9 +254,6 @@ sft_close(Pid) ->
|
||||||
sft_checkready(Pid) ->
|
sft_checkready(Pid) ->
|
||||||
gen_server:call(Pid, background_complete, infinity).
|
gen_server:call(Pid, background_complete, infinity).
|
||||||
|
|
||||||
%% Ask the SFT file process for the filename it is managing.
%% Makes a synchronous get_filename call with no timeout. The original
%% passed the misspelt atom `infinty`, which is not a valid gen_server:call/3
%% timeout.
sft_getfilename(Pid) ->
    gen_server:call(Pid, get_filename, infinity).
|
|
||||||
|
|
||||||
sft_getmaxsequencenumber(Pid) ->
|
sft_getmaxsequencenumber(Pid) ->
|
||||||
gen_server:call(Pid, get_maxsqn, infinity).
|
gen_server:call(Pid, get_maxsqn, infinity).
|
||||||
|
|
||||||
|
@ -330,8 +326,6 @@ handle_call(background_complete, _From, State) ->
|
||||||
false ->
|
false ->
|
||||||
{reply, {error, State#state.background_failure}, State}
|
{reply, {error, State#state.background_failure}, State}
|
||||||
end;
|
end;
|
||||||
handle_call(get_filename, _From, State) ->
|
|
||||||
{reply, State#state.filename, State};
|
|
||||||
handle_call({set_for_delete, Penciller}, _From, State) ->
|
handle_call({set_for_delete, Penciller}, _From, State) ->
|
||||||
{reply,
|
{reply,
|
||||||
ok,
|
ok,
|
||||||
|
@ -362,9 +356,7 @@ handle_info(timeout, State) ->
|
||||||
end;
|
end;
|
||||||
false ->
|
false ->
|
||||||
{noreply, State}
|
{noreply, State}
|
||||||
end;
|
end.
|
||||||
handle_info(_Info, State) ->
|
|
||||||
{noreply, State}.
|
|
||||||
|
|
||||||
terminate(Reason, State) ->
|
terminate(Reason, State) ->
|
||||||
io:format("Exit called for reason ~w on filename ~s~n",
|
io:format("Exit called for reason ~w on filename ~s~n",
|
||||||
|
@ -878,18 +870,12 @@ sftwrite_function(finalise,
|
||||||
IndexLength:32/integer,
|
IndexLength:32/integer,
|
||||||
FilterLength:32/integer,
|
FilterLength:32/integer,
|
||||||
SummaryLength:32/integer>>),
|
SummaryLength:32/integer>>),
|
||||||
file:close(Handle);
|
{ok, _Position} = file:position(Handle, bof),
|
||||||
sftwrite_function(finalise,
|
ok = file:advise(Handle,
|
||||||
{Handle,
|
BlocksLength + IndexLength,
|
||||||
SlotIndex,
|
FilterLength,
|
||||||
SNExtremes,
|
will_need),
|
||||||
KeyExtremes}) ->
|
file:close(Handle).
|
||||||
{SlotFilters, PointerIndex} = convert_slotindex(SlotIndex),
|
|
||||||
sftwrite_function(finalise,
|
|
||||||
{Handle,
|
|
||||||
{SlotFilters, PointerIndex},
|
|
||||||
SNExtremes,
|
|
||||||
KeyExtremes}).
|
|
||||||
|
|
||||||
%% Level 0 files are of variable (infinite) size to avoid issues with having
|
%% Level 0 files are of variable (infinite) size to avoid issues with having
|
||||||
%% any remainders when flushing from memory
|
%% any remainders when flushing from memory
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue